/*-
 * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: stable/11/sys/dev/mlx5/mlx5_ib/mlx5_ib_main.c 341922 2018-12-12 12:02:06Z hselasky $
 */

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/pci.h>
#include <linux/dma-mapping.h>
#include <linux/slab.h>
#if defined(CONFIG_X86)
#include <asm/pat.h>
#endif
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/fs.h>
#undef inode
#include <rdma/ib_user_verbs.h>
#include <rdma/ib_addr.h>
#include <rdma/ib_cache.h>
#include <dev/mlx5/port.h>
#include <dev/mlx5/vport.h>
#include <linux/list.h>
#include <rdma/ib_smi.h>
#include <rdma/ib_umem.h>
#include <linux/in.h>
#include <linux/etherdevice.h>
#include <dev/mlx5/fs.h>
#include "mlx5_ib.h"

#define DRIVER_NAME "mlx5_ib"
#ifndef DRIVER_VERSION
#define DRIVER_VERSION "3.4.2"
#endif
#define DRIVER_RELDATE	"July 2018"

MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
MODULE_LICENSE("Dual BSD/GPL");
MODULE_DEPEND(mlx5ib, linuxkpi, 1, 1, 1);
MODULE_DEPEND(mlx5ib, mlx5, 1, 1, 1);
MODULE_DEPEND(mlx5ib, ibcore, 1, 1, 1);
MODULE_VERSION(mlx5ib, 1);

static int deprecated_prof_sel = 2;
module_param_named(prof_sel, deprecated_prof_sel, int, 0444);
MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core");

static const char mlx5_version[] =
	DRIVER_NAME ": Mellanox Connect-IB Infiniband driver "
	DRIVER_VERSION " (" DRIVER_RELDATE ")\n";

enum {
	MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3,
};

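/*
 * Map the firmware port_type capability (MLX5_CAP_PORT_TYPE_IB/ETH) to the
 * RDMA link layer reported to the IB core: Ethernet ports are exposed as
 * RoCE, InfiniBand ports as native IB.
 */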
static enum rdma_link_layer
mlx5_port_type_cap_to_rdma_ll(int port_type_cap)
{
	switch (port_type_cap) {
	case MLX5_CAP_PORT_TYPE_IB:
		return IB_LINK_LAYER_INFINIBAND;
	case MLX5_CAP_PORT_TYPE_ETH:
		return IB_LINK_LAYER_ETHERNET;
	default:
		return IB_LINK_LAYER_UNSPECIFIED;
	}
}

static enum rdma_link_layer
mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type);

	return mlx5_port_type_cap_to_rdma_ll(port_type_cap);
}

static bool mlx5_netdev_match(struct net_device *ndev,
			      struct mlx5_core_dev *mdev,
			      const char *dname)
{
	return ndev->if_type == IFT_ETHER &&
	  ndev->if_dname != NULL &&
	  strcmp(ndev->if_dname, dname) == 0 &&
	  ndev->if_softc != NULL &&
	  *(struct mlx5_core_dev **)ndev->if_softc == mdev;
}

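/*
 * Network device notifier.  On REGISTER/UNREGISTER it caches (or clears) the
 * mlx5en ifnet that belongs to this mlx5_core_dev under roce.netdev_lock; on
 * UP/DOWN for the tracked ifnet it translates the carrier change into an
 * IB_EVENT_PORT_ACTIVE or IB_EVENT_PORT_ERR event on port 1.
 */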
static int mlx5_netdev_event(struct notifier_block *this,
			     unsigned long event, void *ptr)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
	struct mlx5_ib_dev *ibdev = container_of(this, struct mlx5_ib_dev,
						 roce.nb);

	switch (event) {
	case NETDEV_REGISTER:
	case NETDEV_UNREGISTER:
		write_lock(&ibdev->roce.netdev_lock);
		/* check if network interface belongs to mlx5en */
		if (mlx5_netdev_match(ndev, ibdev->mdev, "mce"))
			ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ?
					     NULL : ndev;
		write_unlock(&ibdev->roce.netdev_lock);
		break;

	case NETDEV_UP:
	case NETDEV_DOWN: {
		struct net_device *upper = NULL;

		if ((upper == ndev || (!upper && ndev == ibdev->roce.netdev))
		    && ibdev->ib_active) {
			struct ib_event ibev = {0};

			ibev.device = &ibdev->ib_dev;
			ibev.event = (event == NETDEV_UP) ?
				     IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
			ibev.element.port_num = 1;
			ib_dispatch_event(&ibev);
		}
		break;
	}

	default:
		break;
	}

	return NOTIFY_DONE;
}

static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
					     u8 port_num)
{
	struct mlx5_ib_dev *ibdev = to_mdev(device);
	struct net_device *ndev;

	/* Ensure ndev does not disappear before we invoke dev_hold() */
	read_lock(&ibdev->roce.netdev_lock);
	ndev = ibdev->roce.netdev;
	if (ndev)
		dev_hold(ndev);
	read_unlock(&ibdev->roce.netdev_lock);

	return ndev;
}

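/*
 * Translate the PTYS eth_proto_oper bit reported by firmware into the
 * (IB speed, IB width) pair used by ib_port_attr, e.g. 40G link modes map to
 * 4X/QDR and 100G link modes to 4X/EDR.  Unknown bits return -EINVAL.
 */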
static int translate_eth_proto_oper(u32 eth_proto_oper, u8 *active_speed,
				    u8 *active_width)
{
	switch (eth_proto_oper) {
	case MLX5E_PROT_MASK(MLX5E_1000BASE_CX_SGMII):
	case MLX5E_PROT_MASK(MLX5E_1000BASE_KX):
	case MLX5E_PROT_MASK(MLX5E_100BASE_TX):
	case MLX5E_PROT_MASK(MLX5E_1000BASE_T):
		*active_width = IB_WIDTH_1X;
		*active_speed = IB_SPEED_SDR;
		break;
	case MLX5E_PROT_MASK(MLX5E_10GBASE_T):
	case MLX5E_PROT_MASK(MLX5E_10GBASE_CX4):
	case MLX5E_PROT_MASK(MLX5E_10GBASE_KX4):
	case MLX5E_PROT_MASK(MLX5E_10GBASE_KR):
	case MLX5E_PROT_MASK(MLX5E_10GBASE_CR):
	case MLX5E_PROT_MASK(MLX5E_10GBASE_SR):
	case MLX5E_PROT_MASK(MLX5E_10GBASE_ER):
		*active_width = IB_WIDTH_1X;
		*active_speed = IB_SPEED_QDR;
		break;
	case MLX5E_PROT_MASK(MLX5E_25GBASE_CR):
	case MLX5E_PROT_MASK(MLX5E_25GBASE_KR):
	case MLX5E_PROT_MASK(MLX5E_25GBASE_SR):
		*active_width = IB_WIDTH_1X;
		*active_speed = IB_SPEED_EDR;
		break;
	case MLX5E_PROT_MASK(MLX5E_40GBASE_CR4):
	case MLX5E_PROT_MASK(MLX5E_40GBASE_KR4):
	case MLX5E_PROT_MASK(MLX5E_40GBASE_SR4):
	case MLX5E_PROT_MASK(MLX5E_40GBASE_LR4):
		*active_width = IB_WIDTH_4X;
		*active_speed = IB_SPEED_QDR;
		break;
	case MLX5E_PROT_MASK(MLX5E_50GBASE_CR2):
	case MLX5E_PROT_MASK(MLX5E_50GBASE_KR2):
	case MLX5E_PROT_MASK(MLX5E_50GBASE_SR2):
		*active_width = IB_WIDTH_1X;
		*active_speed = IB_SPEED_HDR;
		break;
	case MLX5E_PROT_MASK(MLX5E_56GBASE_R4):
		*active_width = IB_WIDTH_4X;
		*active_speed = IB_SPEED_FDR;
		break;
	case MLX5E_PROT_MASK(MLX5E_100GBASE_CR4):
	case MLX5E_PROT_MASK(MLX5E_100GBASE_SR4):
	case MLX5E_PROT_MASK(MLX5E_100GBASE_KR4):
	case MLX5E_PROT_MASK(MLX5E_100GBASE_LR4):
		*active_width = IB_WIDTH_4X;
		*active_speed = IB_SPEED_EDR;
		break;
	default:
		return -EINVAL;
	}

	return 0;
}

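/*
 * ib_port_attr query for RoCE (Ethernet) ports.  Speed/width come from the
 * PTYS eth_proto_oper field, the qkey violation counter from the NIC vport
 * context, and state/MTU from the cached mlx5en ifnet, if one is registered.
 */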
static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
				struct ib_port_attr *props)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	struct net_device *ndev;
	enum ib_mtu ndev_ib_mtu;
	u16 qkey_viol_cntr;
	u32 eth_prot_oper;
	int err;

	memset(props, 0, sizeof(*props));

	/* Possible bad flows are checked before filling out props so in case
	 * of an error it will still be zeroed out.
	 */
	err = mlx5_query_port_eth_proto_oper(dev->mdev, &eth_prot_oper, port_num);
	if (err)
		return err;

	translate_eth_proto_oper(eth_prot_oper, &props->active_speed,
				 &props->active_width);

	props->port_cap_flags  |= IB_PORT_CM_SUP;
	props->port_cap_flags  |= IB_PORT_IP_BASED_GIDS;

	props->gid_tbl_len      = MLX5_CAP_ROCE(dev->mdev,
						roce_address_table_size);
	props->max_mtu          = IB_MTU_4096;
	props->max_msg_sz       = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg);
	props->pkey_tbl_len     = 1;
	props->state            = IB_PORT_DOWN;
	props->phys_state       = 3;

	mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr);
	props->qkey_viol_cntr = qkey_viol_cntr;

	ndev = mlx5_ib_get_netdev(device, port_num);
	if (!ndev)
		return 0;

	if (netif_running(ndev) && netif_carrier_ok(ndev)) {
		props->state      = IB_PORT_ACTIVE;
		props->phys_state = 5;
	}

	ndev_ib_mtu = iboe_get_mtu(ndev->if_mtu);

	dev_put(ndev);

	props->active_mtu	= min(props->max_mtu, ndev_ib_mtu);
	return 0;
}

static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid,
				     const struct ib_gid_attr *attr,
				     void *mlx5_addr)
{
#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v)
	char *mlx5_addr_l3_addr	= MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
					       source_l3_address);
	void *mlx5_addr_mac	= MLX5_ADDR_OF(roce_addr_layout, mlx5_addr,
					       source_mac_47_32);
	u16 vlan_id;

	if (!gid)
		return;
	ether_addr_copy(mlx5_addr_mac, IF_LLADDR(attr->ndev));

	vlan_id = rdma_vlan_dev_vlan_id(attr->ndev);
	if (vlan_id != 0xffff) {
		MLX5_SET_RA(mlx5_addr, vlan_valid, 1);
		MLX5_SET_RA(mlx5_addr, vlan_id, vlan_id);
	}

	switch (attr->gid_type) {
	case IB_GID_TYPE_IB:
		MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1);
		break;
	case IB_GID_TYPE_ROCE_UDP_ENCAP:
		MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2);
		break;

	default:
		WARN_ON(true);
	}

	if (attr->gid_type != IB_GID_TYPE_IB) {
		if (ipv6_addr_v4mapped((void *)gid))
			MLX5_SET_RA(mlx5_addr, roce_l3_type,
				    MLX5_ROCE_L3_TYPE_IPV4);
		else
			MLX5_SET_RA(mlx5_addr, roce_l3_type,
				    MLX5_ROCE_L3_TYPE_IPV6);
	}

	if ((attr->gid_type == IB_GID_TYPE_IB) ||
	    !ipv6_addr_v4mapped((void *)gid))
		memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid));
	else
		memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4);
}

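/*
 * Program one entry of the device RoCE address table.  The GID, MAC, VLAN
 * and RoCE version are packed into a roce_addr_layout and written with the
 * SET_ROCE_ADDRESS command; a NULL gid clears the entry (see del_gid below).
 */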
static int set_roce_addr(struct ib_device *device, u8 port_num,
			 unsigned int index,
			 const union ib_gid *gid,
			 const struct ib_gid_attr *attr)
{
	struct mlx5_ib_dev *dev = to_mdev(device);
	u32  in[MLX5_ST_SZ_DW(set_roce_address_in)]  = {0};
	u32 out[MLX5_ST_SZ_DW(set_roce_address_out)] = {0};
	void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, roce_address);
	enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num);

	if (ll != IB_LINK_LAYER_ETHERNET)
		return -EINVAL;

	ib_gid_to_mlx5_roce_addr(gid, attr, in_addr);

	MLX5_SET(set_roce_address_in, in, roce_address_index, index);
	MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS);
	return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out));
}

static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num,
			   unsigned int index, const union ib_gid *gid,
			   const struct ib_gid_attr *attr,
			   __always_unused void **context)
{
	return set_roce_addr(device, port_num, index, gid, attr);
}

static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num,
			   unsigned int index, __always_unused void **context)
{
	return set_roce_addr(device, port_num, index, NULL, NULL);
}

__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
			       int index)
{
	struct ib_gid_attr attr;
	union ib_gid gid;

	if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr))
		return 0;

	if (!attr.ndev)
		return 0;

	dev_put(attr.ndev);

	if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP)
		return 0;

	return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port));
}

int mlx5_get_roce_gid_type(struct mlx5_ib_dev *dev, u8 port_num,
			   int index, enum ib_gid_type *gid_type)
{
	struct ib_gid_attr attr;
	union ib_gid gid;
	int ret;

	ret = ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr);
	if (ret)
		return ret;

	if (!attr.ndev)
		return -ENODEV;

	dev_put(attr.ndev);

	*gid_type = attr.gid_type;

	return 0;
}

static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
{
	if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB)
		return !MLX5_CAP_GEN(dev->mdev, ib_virt);
	return 0;
}

enum {
	MLX5_VPORT_ACCESS_METHOD_MAD,
	MLX5_VPORT_ACCESS_METHOD_HCA,
	MLX5_VPORT_ACCESS_METHOD_NIC,
};

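/*
 * Pick how port/device attributes are queried: MADs are used on IB ports
 * without the ib_virt capability, the NIC vport context on Ethernet (RoCE)
 * ports, and the HCA vport context otherwise.
 */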
static int mlx5_get_vport_access_method(struct ib_device *ibdev)
{
	if (mlx5_use_mad_ifc(to_mdev(ibdev)))
		return MLX5_VPORT_ACCESS_METHOD_MAD;

	if (mlx5_ib_port_link_layer(ibdev, 1) ==
	    IB_LINK_LAYER_ETHERNET)
		return MLX5_VPORT_ACCESS_METHOD_NIC;

	return MLX5_VPORT_ACCESS_METHOD_HCA;
}

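/*
 * Advertise IB_ATOMIC_HCA only when firmware supports both 8-byte
 * compare-and-swap and fetch-and-add at QP level and can respond in host
 * endianness; otherwise report IB_ATOMIC_NONE.
 */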
static void get_atomic_caps(struct mlx5_ib_dev *dev,
			    struct ib_device_attr *props)
{
	u8 tmp;
	u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations);
	u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp);
	u8 atomic_req_8B_endianness_mode =
		MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode);

	/* Check if HW supports 8-byte standard atomic operations and can
	 * respond in host endianness.
	 */
	tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD;
	if (((atomic_operations & tmp) == tmp) &&
	    (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) &&
	    (atomic_req_8B_endianness_mode)) {
		props->atomic_cap = IB_ATOMIC_HCA;
	} else {
		props->atomic_cap = IB_ATOMIC_NONE;
	}
}

static int mlx5_query_system_image_guid(struct ib_device *ibdev,
					__be64 *sys_image_guid)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;
	u64 tmp;
	int err;

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_system_image_guid(ibdev,
							    sys_image_guid);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
		err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp);
		break;

	case MLX5_VPORT_ACCESS_METHOD_NIC:
		err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp);
		break;

	default:
		return -EINVAL;
	}

	if (!err)
		*sys_image_guid = cpu_to_be64(tmp);

	return err;

}

static int mlx5_query_max_pkeys(struct ib_device *ibdev,
				u16 *max_pkeys)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);
	struct mlx5_core_dev *mdev = dev->mdev;

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_max_pkeys(ibdev, max_pkeys);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
	case MLX5_VPORT_ACCESS_METHOD_NIC:
		*max_pkeys = mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev,
						pkey_table_size));
		return 0;

	default:
		return -EINVAL;
	}
}

static int mlx5_query_vendor_id(struct ib_device *ibdev,
				u32 *vendor_id)
{
	struct mlx5_ib_dev *dev = to_mdev(ibdev);

	switch (mlx5_get_vport_access_method(ibdev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_vendor_id(ibdev, vendor_id);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
	case MLX5_VPORT_ACCESS_METHOD_NIC:
		return mlx5_core_query_vendor_id(dev->mdev, vendor_id);

	default:
		return -EINVAL;
	}
}

static int mlx5_query_node_guid(struct mlx5_ib_dev *dev,
				__be64 *node_guid)
{
	u64 tmp;
	int err;

	switch (mlx5_get_vport_access_method(&dev->ib_dev)) {
	case MLX5_VPORT_ACCESS_METHOD_MAD:
		return mlx5_query_mad_ifc_node_guid(dev, node_guid);

	case MLX5_VPORT_ACCESS_METHOD_HCA:
		err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp);
		break;

	case MLX5_VPORT_ACCESS_METHOD_NIC:
		err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp);
		break;

	default:
		return -EINVAL;
	}

	if (!err)
		*node_guid = cpu_to_be64(tmp);

	return err;
}

struct mlx5_reg_node_desc {
	u8	desc[IB_DEVICE_NODE_DESC_MAX];
};

static int mlx5_query_node_desc(struct mlx5_ib_dev *dev, char *node_desc)
{
	struct mlx5_reg_node_desc in;

	if (mlx5_use_mad_ifc(dev))
		return mlx5_query_mad_ifc_node_desc(dev, node_desc);

	memset(&in, 0, sizeof(in));

	return mlx5_core_access_reg(dev->mdev, &in, sizeof(in), node_desc,
				    sizeof(struct mlx5_reg_node_desc),
				    MLX5_REG_NODE_DESC, 0, 0);
}

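/*
 * ib_device_attr query.  The response returned to user space is extensible:
 * resp starts with comp_mask/response_length, and optional blocks (TSO caps,
 * RSS caps) are appended only when the caller's output buffer is large
 * enough, as tested with field_avail() against uhw->outlen.
 */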
570static int mlx5_ib_query_device(struct ib_device *ibdev,
571				struct ib_device_attr *props,
572				struct ib_udata *uhw)
573{
574	struct mlx5_ib_dev *dev = to_mdev(ibdev);
575	struct mlx5_core_dev *mdev = dev->mdev;
576	int err = -ENOMEM;
577	int max_rq_sg;
578	int max_sq_sg;
579	u64 min_page_size = 1ull << MLX5_CAP_GEN(mdev, log_pg_sz);
580	struct mlx5_ib_query_device_resp resp = {};
581	size_t resp_len;
582	u64 max_tso;
583
584	resp_len = sizeof(resp.comp_mask) + sizeof(resp.response_length);
585	if (uhw->outlen && uhw->outlen < resp_len)
586		return -EINVAL;
587	else
588		resp.response_length = resp_len;
589
590	if (uhw->inlen && !ib_is_udata_cleared(uhw, 0, uhw->inlen))
591		return -EINVAL;
592
593	memset(props, 0, sizeof(*props));
594	err = mlx5_query_system_image_guid(ibdev,
595					   &props->sys_image_guid);
596	if (err)
597		return err;
598
599	err = mlx5_query_max_pkeys(ibdev, &props->max_pkeys);
600	if (err)
601		return err;
602
603	err = mlx5_query_vendor_id(ibdev, &props->vendor_id);
604	if (err)
605		return err;
606
607	props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
608		(fw_rev_min(dev->mdev) << 16) |
609		fw_rev_sub(dev->mdev);
610	props->device_cap_flags    = IB_DEVICE_CHANGE_PHY_PORT |
611		IB_DEVICE_PORT_ACTIVE_EVENT		|
612		IB_DEVICE_SYS_IMAGE_GUID		|
613		IB_DEVICE_RC_RNR_NAK_GEN;
614
615	if (MLX5_CAP_GEN(mdev, pkv))
616		props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
617	if (MLX5_CAP_GEN(mdev, qkv))
618		props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
619	if (MLX5_CAP_GEN(mdev, apm))
620		props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
621	if (MLX5_CAP_GEN(mdev, xrc))
622		props->device_cap_flags |= IB_DEVICE_XRC;
623	if (MLX5_CAP_GEN(mdev, imaicl)) {
624		props->device_cap_flags |= IB_DEVICE_MEM_WINDOW |
625					   IB_DEVICE_MEM_WINDOW_TYPE_2B;
626		props->max_mw = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
627		/* We support 'Gappy' memory registration too */
628		props->device_cap_flags |= IB_DEVICE_SG_GAPS_REG;
629	}
630	props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
631	if (MLX5_CAP_GEN(mdev, sho)) {
632		props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER;
633		/* At this stage no support for signature handover */
634		props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
635				      IB_PROT_T10DIF_TYPE_2 |
636				      IB_PROT_T10DIF_TYPE_3;
637		props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
638				       IB_GUARD_T10DIF_CSUM;
639	}
640	if (MLX5_CAP_GEN(mdev, block_lb_mc))
641		props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
642
643	if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads)) {
644		if (MLX5_CAP_ETH(mdev, csum_cap))
645			props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM;
646
647		if (field_avail(typeof(resp), tso_caps, uhw->outlen)) {
648			max_tso = MLX5_CAP_ETH(mdev, max_lso_cap);
649			if (max_tso) {
650				resp.tso_caps.max_tso = 1 << max_tso;
651				resp.tso_caps.supported_qpts |=
652					1 << IB_QPT_RAW_PACKET;
653				resp.response_length += sizeof(resp.tso_caps);
654			}
655		}
656
657		if (field_avail(typeof(resp), rss_caps, uhw->outlen)) {
658			resp.rss_caps.rx_hash_function =
659						MLX5_RX_HASH_FUNC_TOEPLITZ;
660			resp.rss_caps.rx_hash_fields_mask =
661						MLX5_RX_HASH_SRC_IPV4 |
662						MLX5_RX_HASH_DST_IPV4 |
663						MLX5_RX_HASH_SRC_IPV6 |
664						MLX5_RX_HASH_DST_IPV6 |
665						MLX5_RX_HASH_SRC_PORT_TCP |
666						MLX5_RX_HASH_DST_PORT_TCP |
667						MLX5_RX_HASH_SRC_PORT_UDP |
668						MLX5_RX_HASH_DST_PORT_UDP;
669			resp.response_length += sizeof(resp.rss_caps);
670		}
671	} else {
672		if (field_avail(typeof(resp), tso_caps, uhw->outlen))
673			resp.response_length += sizeof(resp.tso_caps);
674		if (field_avail(typeof(resp), rss_caps, uhw->outlen))
675			resp.response_length += sizeof(resp.rss_caps);
676	}
677
678	if (MLX5_CAP_GEN(mdev, ipoib_ipoib_offloads)) {
679		props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
680		props->device_cap_flags |= IB_DEVICE_UD_TSO;
681	}
682
683	if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) &&
684	    MLX5_CAP_ETH(dev->mdev, scatter_fcs))
685		props->device_cap_flags |= IB_DEVICE_RAW_SCATTER_FCS;
686
687	if (mlx5_get_flow_namespace(dev->mdev, MLX5_FLOW_NAMESPACE_BYPASS))
688		props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
689
690	props->vendor_part_id	   = mdev->pdev->device;
691	props->hw_ver		   = mdev->pdev->revision;
692
693	props->max_mr_size	   = ~0ull;
694	props->page_size_cap	   = ~(min_page_size - 1);
695	props->max_qp		   = 1 << MLX5_CAP_GEN(mdev, log_max_qp);
696	props->max_qp_wr	   = 1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
697	max_rq_sg =  MLX5_CAP_GEN(mdev, max_wqe_sz_rq) /
698		     sizeof(struct mlx5_wqe_data_seg);
699	max_sq_sg = (MLX5_CAP_GEN(mdev, max_wqe_sz_sq) -
700		     sizeof(struct mlx5_wqe_ctrl_seg)) /
701		     sizeof(struct mlx5_wqe_data_seg);
702	props->max_sge = min(max_rq_sg, max_sq_sg);
703	props->max_sge_rd	   = MLX5_MAX_SGE_RD;
704	props->max_cq		   = 1 << MLX5_CAP_GEN(mdev, log_max_cq);
705	props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1;
706	props->max_mr		   = 1 << MLX5_CAP_GEN(mdev, log_max_mkey);
707	props->max_pd		   = 1 << MLX5_CAP_GEN(mdev, log_max_pd);
708	props->max_qp_rd_atom	   = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp);
709	props->max_qp_init_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_res_qp);
710	props->max_srq		   = 1 << MLX5_CAP_GEN(mdev, log_max_srq);
711	props->max_srq_wr = (1 << MLX5_CAP_GEN(mdev, log_max_srq_sz)) - 1;
712	props->local_ca_ack_delay  = MLX5_CAP_GEN(mdev, local_ca_ack_delay);
713	props->max_res_rd_atom	   = props->max_qp_rd_atom * props->max_qp;
714	props->max_srq_sge	   = max_rq_sg - 1;
715	props->max_fast_reg_page_list_len =
716		1 << MLX5_CAP_GEN(mdev, log_max_klm_list_size);
717	get_atomic_caps(dev, props);
718	props->masked_atomic_cap   = IB_ATOMIC_NONE;
719	props->max_mcast_grp	   = 1 << MLX5_CAP_GEN(mdev, log_max_mcg);
720	props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg);
721	props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
722					   props->max_mcast_grp;
723	props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
724	props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
725	props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
726
727#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
728	if (MLX5_CAP_GEN(mdev, pg))
729		props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
730	props->odp_caps = dev->odp_caps;
731#endif
732
733	if (MLX5_CAP_GEN(mdev, cd))
734		props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
735
736	if (!mlx5_core_is_pf(mdev))
737		props->device_cap_flags |= IB_DEVICE_VIRTUAL_FUNCTION;
738
739	if (mlx5_ib_port_link_layer(ibdev, 1) ==
740	    IB_LINK_LAYER_ETHERNET) {
741		props->rss_caps.max_rwq_indirection_tables =
742			1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt);
743		props->rss_caps.max_rwq_indirection_table_size =
744			1 << MLX5_CAP_GEN(dev->mdev, log_max_rqt_size);
745		props->rss_caps.supported_qpts = 1 << IB_QPT_RAW_PACKET;
746		props->max_wq_type_rq =
747			1 << MLX5_CAP_GEN(dev->mdev, log_max_rq);
748	}
749
750	if (uhw->outlen) {
751		err = ib_copy_to_udata(uhw, &resp, resp.response_length);
752
753		if (err)
754			return err;
755	}
756
757	return 0;
758}
759
760enum mlx5_ib_width {
761	MLX5_IB_WIDTH_1X	= 1 << 0,
762	MLX5_IB_WIDTH_2X	= 1 << 1,
763	MLX5_IB_WIDTH_4X	= 1 << 2,
764	MLX5_IB_WIDTH_8X	= 1 << 3,
765	MLX5_IB_WIDTH_12X	= 1 << 4
766};
767
768static int translate_active_width(struct ib_device *ibdev, u8 active_width,
769				  u8 *ib_width)
770{
771	struct mlx5_ib_dev *dev = to_mdev(ibdev);
772	int err = 0;
773
774	if (active_width & MLX5_IB_WIDTH_1X) {
775		*ib_width = IB_WIDTH_1X;
776	} else if (active_width & MLX5_IB_WIDTH_2X) {
777		mlx5_ib_dbg(dev, "active_width %d is not supported by IB spec\n",
778			    (int)active_width);
779		err = -EINVAL;
780	} else if (active_width & MLX5_IB_WIDTH_4X) {
781		*ib_width = IB_WIDTH_4X;
782	} else if (active_width & MLX5_IB_WIDTH_8X) {
783		*ib_width = IB_WIDTH_8X;
784	} else if (active_width & MLX5_IB_WIDTH_12X) {
785		*ib_width = IB_WIDTH_12X;
786	} else {
787		mlx5_ib_dbg(dev, "Invalid active_width %d\n",
788			    (int)active_width);
789		err = -EINVAL;
790	}
791
792	return err;
793}
794
795enum ib_max_vl_num {
796	__IB_MAX_VL_0		= 1,
797	__IB_MAX_VL_0_1		= 2,
798	__IB_MAX_VL_0_3		= 3,
799	__IB_MAX_VL_0_7		= 4,
800	__IB_MAX_VL_0_14	= 5,
801};
802
803enum mlx5_vl_hw_cap {
804	MLX5_VL_HW_0	= 1,
805	MLX5_VL_HW_0_1	= 2,
806	MLX5_VL_HW_0_2	= 3,
807	MLX5_VL_HW_0_3	= 4,
808	MLX5_VL_HW_0_4	= 5,
809	MLX5_VL_HW_0_5	= 6,
810	MLX5_VL_HW_0_6	= 7,
811	MLX5_VL_HW_0_7	= 8,
812	MLX5_VL_HW_0_14	= 15
813};
814
815static int translate_max_vl_num(struct ib_device *ibdev, u8 vl_hw_cap,
816				u8 *max_vl_num)
817{
818	switch (vl_hw_cap) {
819	case MLX5_VL_HW_0:
820		*max_vl_num = __IB_MAX_VL_0;
821		break;
822	case MLX5_VL_HW_0_1:
823		*max_vl_num = __IB_MAX_VL_0_1;
824		break;
825	case MLX5_VL_HW_0_3:
826		*max_vl_num = __IB_MAX_VL_0_3;
827		break;
828	case MLX5_VL_HW_0_7:
829		*max_vl_num = __IB_MAX_VL_0_7;
830		break;
831	case MLX5_VL_HW_0_14:
832		*max_vl_num = __IB_MAX_VL_0_14;
833		break;
834
835	default:
836		return -EINVAL;
837	}
838
839	return 0;
840}
841
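/*
 * ib_port_attr query for native IB ports: port state and LIDs come from the
 * HCA vport context, speed/width from the PTYS register, MTU from PMTU and
 * the VL capability from PVLC.  All temporary buffers are released on the
 * common "out" path.
 */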
842static int mlx5_query_hca_port(struct ib_device *ibdev, u8 port,
843			       struct ib_port_attr *props)
844{
845	struct mlx5_ib_dev *dev = to_mdev(ibdev);
846	struct mlx5_core_dev *mdev = dev->mdev;
847	u32 *rep;
848	int replen = MLX5_ST_SZ_BYTES(query_hca_vport_context_out);
849	struct mlx5_ptys_reg *ptys;
850	struct mlx5_pmtu_reg *pmtu;
851	struct mlx5_pvlc_reg pvlc;
852	void *ctx;
853	int err;
854
855	rep = mlx5_vzalloc(replen);
856	ptys = kzalloc(sizeof(*ptys), GFP_KERNEL);
857	pmtu = kzalloc(sizeof(*pmtu), GFP_KERNEL);
858	if (!rep || !ptys || !pmtu) {
859		err = -ENOMEM;
860		goto out;
861	}
862
863	memset(props, 0, sizeof(*props));
864
865	err = mlx5_query_hca_vport_context(mdev, port, 0, rep, replen);
866	if (err)
867		goto out;
868
869	ctx = MLX5_ADDR_OF(query_hca_vport_context_out, rep, hca_vport_context);
870
871	props->lid		= MLX5_GET(hca_vport_context, ctx, lid);
872	props->lmc		= MLX5_GET(hca_vport_context, ctx, lmc);
873	props->sm_lid		= MLX5_GET(hca_vport_context, ctx, sm_lid);
874	props->sm_sl		= MLX5_GET(hca_vport_context, ctx, sm_sl);
875	props->state		= MLX5_GET(hca_vport_context, ctx, vport_state);
876	props->phys_state	= MLX5_GET(hca_vport_context, ctx,
877					port_physical_state);
878	props->port_cap_flags	= MLX5_GET(hca_vport_context, ctx, cap_mask1);
879	props->gid_tbl_len	= mlx5_get_gid_table_len(MLX5_CAP_GEN(mdev, gid_table_size));
880	props->max_msg_sz	= 1 << MLX5_CAP_GEN(mdev, log_max_msg);
881	props->pkey_tbl_len	= mlx5_to_sw_pkey_sz(MLX5_CAP_GEN(mdev, pkey_table_size));
882	props->bad_pkey_cntr	= MLX5_GET(hca_vport_context, ctx,
883					pkey_violation_counter);
884	props->qkey_viol_cntr	= MLX5_GET(hca_vport_context, ctx,
885					qkey_violation_counter);
886	props->subnet_timeout	= MLX5_GET(hca_vport_context, ctx,
887					subnet_timeout);
888	props->init_type_reply	= MLX5_GET(hca_vport_context, ctx,
889					init_type_reply);
890	props->grh_required	= MLX5_GET(hca_vport_context, ctx, grh_required);
891
892	ptys->proto_mask |= MLX5_PTYS_IB;
893	ptys->local_port = port;
894	err = mlx5_core_access_ptys(mdev, ptys, 0);
895	if (err)
896		goto out;
897
898	err = translate_active_width(ibdev, ptys->ib_link_width_oper,
899				     &props->active_width);
900	if (err)
901		goto out;
902
903	props->active_speed	= (u8)ptys->ib_proto_oper;
904
905	pmtu->local_port = port;
906	err = mlx5_core_access_pmtu(mdev, pmtu, 0);
907	if (err)
908		goto out;
909
910	props->max_mtu		= pmtu->max_mtu;
911	props->active_mtu	= pmtu->oper_mtu;
912
913	memset(&pvlc, 0, sizeof(pvlc));
914	pvlc.local_port = port;
915	err = mlx5_core_access_pvlc(mdev, &pvlc, 0);
916	if (err)
917		goto out;
918
919	err = translate_max_vl_num(ibdev, pvlc.vl_hw_cap,
920				   &props->max_vl_num);
921out:
922	kvfree(rep);
923	kfree(ptys);
924	kfree(pmtu);
925	return err;
926}
927
928int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
929		       struct ib_port_attr *props)
930{
931	switch (mlx5_get_vport_access_method(ibdev)) {
932	case MLX5_VPORT_ACCESS_METHOD_MAD:
933		return mlx5_query_mad_ifc_port(ibdev, port, props);
934
935	case MLX5_VPORT_ACCESS_METHOD_HCA:
936		return mlx5_query_hca_port(ibdev, port, props);
937
938	case MLX5_VPORT_ACCESS_METHOD_NIC:
939		return mlx5_query_port_roce(ibdev, port, props);
940
941	default:
942		return -EINVAL;
943	}
944}
945
946static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
947			     union ib_gid *gid)
948{
949	struct mlx5_ib_dev *dev = to_mdev(ibdev);
950	struct mlx5_core_dev *mdev = dev->mdev;
951
952	switch (mlx5_get_vport_access_method(ibdev)) {
953	case MLX5_VPORT_ACCESS_METHOD_MAD:
954		return mlx5_query_mad_ifc_gids(ibdev, port, index, gid);
955
956	case MLX5_VPORT_ACCESS_METHOD_HCA:
957		return mlx5_query_hca_vport_gid(mdev, port, 0, index, gid);
958
959	default:
960		return -EINVAL;
961	}
962
963}
964
965static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
966			      u16 *pkey)
967{
968	struct mlx5_ib_dev *dev = to_mdev(ibdev);
969	struct mlx5_core_dev *mdev = dev->mdev;
970
971	switch (mlx5_get_vport_access_method(ibdev)) {
972	case MLX5_VPORT_ACCESS_METHOD_MAD:
973		return mlx5_query_mad_ifc_pkey(ibdev, port, index, pkey);
974
975	case MLX5_VPORT_ACCESS_METHOD_HCA:
976	case MLX5_VPORT_ACCESS_METHOD_NIC:
977		return mlx5_query_hca_vport_pkey(mdev, 0, port,  0, index,
978						 pkey);
979	default:
980		return -EINVAL;
981	}
982}
983
984static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
985				 struct ib_device_modify *props)
986{
987	struct mlx5_ib_dev *dev = to_mdev(ibdev);
988	struct mlx5_reg_node_desc in;
989	struct mlx5_reg_node_desc out;
990	int err;
991
992	if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
993		return -EOPNOTSUPP;
994
995	if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
996		return 0;
997
998	/*
999	 * If possible, pass node desc to FW, so it can generate
1000	 * a 144 trap.  If cmd fails, just ignore.
1001	 */
1002	memcpy(&in, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
1003	err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
1004				   sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
1005	if (err)
1006		return err;
1007
1008	memcpy(ibdev->node_desc, props->node_desc, IB_DEVICE_NODE_DESC_MAX);
1009
1010	return err;
1011}
1012
1013static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
1014			       struct ib_port_modify *props)
1015{
1016	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1017	struct ib_port_attr attr;
1018	u32 tmp;
1019	int err;
1020
1021	mutex_lock(&dev->cap_mask_mutex);
1022
1023	err = mlx5_ib_query_port(ibdev, port, &attr);
1024	if (err)
1025		goto out;
1026
1027	tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
1028		~props->clr_port_cap_mask;
1029
1030	err = mlx5_set_port_caps(dev->mdev, port, tmp);
1031
1032out:
1033	mutex_unlock(&dev->cap_mask_mutex);
1034	return err;
1035}
1036
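/*
 * Allocate a user context: validate the (v0 or v2) request, carve out the
 * requested number of UUARs/UAR pages, optionally allocate a transport
 * domain, and report device limits back to user space in a response sized
 * against udata->outlen.
 */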
1037static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
1038						  struct ib_udata *udata)
1039{
1040	struct mlx5_ib_dev *dev = to_mdev(ibdev);
1041	struct mlx5_ib_alloc_ucontext_req_v2 req = {};
1042	struct mlx5_ib_alloc_ucontext_resp resp = {};
1043	struct mlx5_ib_ucontext *context;
1044	struct mlx5_uuar_info *uuari;
1045	struct mlx5_uar *uars;
1046	int gross_uuars;
1047	int num_uars;
1048	int ver;
1049	int uuarn;
1050	int err;
1051	int i;
1052	size_t reqlen;
1053	size_t min_req_v2 = offsetof(struct mlx5_ib_alloc_ucontext_req_v2,
1054				     max_cqe_version);
1055
1056	if (!dev->ib_active)
1057		return ERR_PTR(-EAGAIN);
1058
1059	if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr))
1060		return ERR_PTR(-EINVAL);
1061
1062	reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
1063	if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
1064		ver = 0;
1065	else if (reqlen >= min_req_v2)
1066		ver = 2;
1067	else
1068		return ERR_PTR(-EINVAL);
1069
1070	err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req)));
1071	if (err)
1072		return ERR_PTR(err);
1073
1074	if (req.flags)
1075		return ERR_PTR(-EINVAL);
1076
1077	if (req.total_num_uuars > MLX5_MAX_UUARS)
1078		return ERR_PTR(-ENOMEM);
1079
1080	if (req.total_num_uuars == 0)
1081		return ERR_PTR(-EINVAL);
1082
1083	if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
1084		return ERR_PTR(-EOPNOTSUPP);
1085
1086	if (reqlen > sizeof(req) &&
1087	    !ib_is_udata_cleared(udata, sizeof(req),
1088				 reqlen - sizeof(req)))
1089		return ERR_PTR(-EOPNOTSUPP);
1090
1091	req.total_num_uuars = ALIGN(req.total_num_uuars,
1092				    MLX5_NON_FP_BF_REGS_PER_PAGE);
1093	if (req.num_low_latency_uuars > req.total_num_uuars - 1)
1094		return ERR_PTR(-EINVAL);
1095
1096	num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE;
1097	gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE;
1098	resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
1099	if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
1100		resp.bf_reg_size = 1 << MLX5_CAP_GEN(dev->mdev, log_bf_reg_size);
1101	resp.cache_line_size = cache_line_size();
1102	resp.max_sq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq);
1103	resp.max_rq_desc_sz = MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq);
1104	resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1105	resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz);
1106	resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
1107	resp.cqe_version = min_t(__u8,
1108				 (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version),
1109				 req.max_cqe_version);
1110	resp.response_length = min(offsetof(typeof(resp), response_length) +
1111				   sizeof(resp.response_length), udata->outlen);
1112
1113	context = kzalloc(sizeof(*context), GFP_KERNEL);
1114	if (!context)
1115		return ERR_PTR(-ENOMEM);
1116
1117	uuari = &context->uuari;
1118	mutex_init(&uuari->lock);
1119	uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL);
1120	if (!uars) {
1121		err = -ENOMEM;
1122		goto out_ctx;
1123	}
1124
1125	uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars),
1126				sizeof(*uuari->bitmap),
1127				GFP_KERNEL);
1128	if (!uuari->bitmap) {
1129		err = -ENOMEM;
1130		goto out_uar_ctx;
1131	}
1132	/*
1133	 * clear all fast path uuars
1134	 */
1135	for (i = 0; i < gross_uuars; i++) {
1136		uuarn = i & 3;
1137		if (uuarn == 2 || uuarn == 3)
1138			set_bit(i, uuari->bitmap);
1139	}
1140
1141	uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL);
1142	if (!uuari->count) {
1143		err = -ENOMEM;
1144		goto out_bitmap;
1145	}
1146
1147	for (i = 0; i < num_uars; i++) {
1148		err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index);
1149		if (err)
1150			goto out_count;
1151	}
1152
1153#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1154	context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
1155#endif
1156
1157	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) {
1158		err = mlx5_alloc_transport_domain(dev->mdev,
1159						       &context->tdn);
1160		if (err)
1161			goto out_uars;
1162	}
1163
1164	INIT_LIST_HEAD(&context->vma_private_list);
1165	INIT_LIST_HEAD(&context->db_page_list);
1166	mutex_init(&context->db_page_mutex);
1167
1168	resp.tot_uuars = req.total_num_uuars;
1169	resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports);
1170
1171	if (field_avail(typeof(resp), cqe_version, udata->outlen))
1172		resp.response_length += sizeof(resp.cqe_version);
1173
1174	if (field_avail(typeof(resp), cmds_supp_uhw, udata->outlen)) {
1175		resp.cmds_supp_uhw |= MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE |
1176				      MLX5_USER_CMDS_SUPP_UHW_CREATE_AH;
1177		resp.response_length += sizeof(resp.cmds_supp_uhw);
1178	}
1179
1180	/*
1181	 * We don't want to expose information from the PCI bar that is located
1182	 * after 4096 bytes, so if the arch only supports larger pages, let's
1183	 * pretend we don't support reading the HCA's core clock. This is also
1184	 * forced by mmap function.
1185	 */
1186	if (PAGE_SIZE <= 4096 &&
1187	    field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) {
1188		resp.comp_mask |=
1189			MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET;
1190		resp.hca_core_clock_offset =
1191			offsetof(struct mlx5_init_seg, internal_timer_h) %
1192			PAGE_SIZE;
1193		resp.response_length += sizeof(resp.hca_core_clock_offset) +
1194					sizeof(resp.reserved2);
1195	}
1196
1197	err = ib_copy_to_udata(udata, &resp, resp.response_length);
1198	if (err)
1199		goto out_td;
1200
1201	uuari->ver = ver;
1202	uuari->num_low_latency_uuars = req.num_low_latency_uuars;
1203	uuari->uars = uars;
1204	uuari->num_uars = num_uars;
1205	context->cqe_version = resp.cqe_version;
1206
1207	return &context->ibucontext;
1208
1209out_td:
1210	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1211		mlx5_dealloc_transport_domain(dev->mdev, context->tdn);
1212
1213out_uars:
1214	for (i--; i >= 0; i--)
1215		mlx5_cmd_free_uar(dev->mdev, uars[i].index);
1216out_count:
1217	kfree(uuari->count);
1218
1219out_bitmap:
1220	kfree(uuari->bitmap);
1221
1222out_uar_ctx:
1223	kfree(uars);
1224
1225out_ctx:
1226	kfree(context);
1227	return ERR_PTR(err);
1228}
1229
1230static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
1231{
1232	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1233	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1234	struct mlx5_uuar_info *uuari = &context->uuari;
1235	int i;
1236
1237	if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain))
1238		mlx5_dealloc_transport_domain(dev->mdev, context->tdn);
1239
1240	for (i = 0; i < uuari->num_uars; i++) {
1241		if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index))
1242			mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index);
1243	}
1244
1245	kfree(uuari->count);
1246	kfree(uuari->bitmap);
1247	kfree(uuari->uars);
1248	kfree(context);
1249
1250	return 0;
1251}
1252
1253static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index)
1254{
1255	return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index;
1256}
1257
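/*
 * The mmap offset (vm_pgoff) encodes a command in the bits above
 * MLX5_IB_MMAP_CMD_SHIFT and a command-specific argument (e.g. the UAR
 * index) in the bits below it; get_command()/get_index() decode it.
 */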
1258static int get_command(unsigned long offset)
1259{
1260	return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
1261}
1262
1263static int get_arg(unsigned long offset)
1264{
1265	return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
1266}
1267
1268static int get_index(unsigned long offset)
1269{
1270	return get_arg(offset);
1271}
1272
1273static void  mlx5_ib_vma_open(struct vm_area_struct *area)
1274{
1275	/* vma_open is called when a new VMA is created on top of our VMA.  This
1276	 * is done through either mremap flow or split_vma (usually due to
1277	 * mlock, madvise, munmap, etc.) We do not support a clone of the VMA,
1278	 * as this VMA is strongly hardware related.  Therefore we set the
1279	 * vm_ops of the newly created/cloned VMA to NULL, to prevent it from
1280	 * calling us again and trying to do incorrect actions.  We assume that
1281	 * the original VMA size is exactly a single page, and therefore all
1282	 * "splitting" operation will not happen to it.
1283	 */
1284	area->vm_ops = NULL;
1285}
1286
1287static void  mlx5_ib_vma_close(struct vm_area_struct *area)
1288{
1289	struct mlx5_ib_vma_private_data *mlx5_ib_vma_priv_data;
1290
1291	/* It's guaranteed that all VMAs opened on a FD are closed before the
1292	 * file itself is closed, therefore no sync is needed with the regular
1293	 * closing flow. (e.g. mlx5 ib_dealloc_ucontext)
1294	 * However need a sync with accessing the vma as part of
1295	 * mlx5_ib_disassociate_ucontext.
1296	 * The close operation is usually called under mm->mmap_sem except when
1297	 * process is exiting.
1298	 * The exiting case is handled explicitly as part of
1299	 * mlx5_ib_disassociate_ucontext.
1300	 */
1301	mlx5_ib_vma_priv_data = (struct mlx5_ib_vma_private_data *)area->vm_private_data;
1302
1303	/* setting the vma context pointer to null in the mlx5_ib driver's
1304	 * private data, to protect a race condition in
1305	 * mlx5_ib_disassociate_ucontext().
1306	 */
1307	mlx5_ib_vma_priv_data->vma = NULL;
1308	list_del(&mlx5_ib_vma_priv_data->list);
1309	kfree(mlx5_ib_vma_priv_data);
1310}
1311
1312static const struct vm_operations_struct mlx5_ib_vm_ops = {
1313	.open = mlx5_ib_vma_open,
1314	.close = mlx5_ib_vma_close
1315};
1316
1317static int mlx5_ib_set_vma_data(struct vm_area_struct *vma,
1318				struct mlx5_ib_ucontext *ctx)
1319{
1320	struct mlx5_ib_vma_private_data *vma_prv;
1321	struct list_head *vma_head = &ctx->vma_private_list;
1322
1323	vma_prv = kzalloc(sizeof(*vma_prv), GFP_KERNEL);
1324	if (!vma_prv)
1325		return -ENOMEM;
1326
1327	vma_prv->vma = vma;
1328	vma->vm_private_data = vma_prv;
1329	vma->vm_ops =  &mlx5_ib_vm_ops;
1330
1331	list_add(&vma_prv->list, vma_head);
1332
1333	return 0;
1334}
1335
1336static inline char *mmap_cmd2str(enum mlx5_ib_mmap_cmd cmd)
1337{
1338	switch (cmd) {
1339	case MLX5_IB_MMAP_WC_PAGE:
1340		return "WC";
1341	case MLX5_IB_MMAP_REGULAR_PAGE:
1342		return "best effort WC";
1343	case MLX5_IB_MMAP_NC_PAGE:
1344		return "NC";
1345	default:
1346		return NULL;
1347	}
1348}
1349
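/*
 * Map a single UAR page into user space.  WC mappings are refused on
 * architectures that cannot provide write-combining, MLX5_IB_MMAP_REGULAR_PAGE
 * uses best-effort WC and MLX5_IB_MMAP_NC_PAGE maps non-cached.  The VMA is
 * also registered via vm_ops for later disassociation.
 */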
1350static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
1351		    struct vm_area_struct *vma,
1352		    struct mlx5_ib_ucontext *context)
1353{
1354	struct mlx5_uuar_info *uuari = &context->uuari;
1355	int err;
1356	unsigned long idx;
1357	phys_addr_t pfn, pa;
1358	pgprot_t prot;
1359
1360	switch (cmd) {
1361	case MLX5_IB_MMAP_WC_PAGE:
1362/* Some architectures don't support WC memory */
1363#if defined(CONFIG_X86)
1364		if (!pat_enabled())
1365			return -EPERM;
1366#elif !(defined(CONFIG_PPC) || (defined(CONFIG_ARM) && defined(CONFIG_MMU)))
1367			return -EPERM;
1368#endif
1369	/* fall through */
1370	case MLX5_IB_MMAP_REGULAR_PAGE:
1371		/* For MLX5_IB_MMAP_REGULAR_PAGE do the best effort to get WC */
1372		prot = pgprot_writecombine(vma->vm_page_prot);
1373		break;
1374	case MLX5_IB_MMAP_NC_PAGE:
1375		prot = pgprot_noncached(vma->vm_page_prot);
1376		break;
1377	default:
1378		return -EINVAL;
1379	}
1380
1381	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
1382		return -EINVAL;
1383
1384	idx = get_index(vma->vm_pgoff);
1385	if (idx >= uuari->num_uars)
1386		return -EINVAL;
1387
1388	pfn = uar_index2pfn(dev, uuari->uars[idx].index);
1389	mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn %pa\n", idx, &pfn);
1390
1391	vma->vm_page_prot = prot;
1392	err = io_remap_pfn_range(vma, vma->vm_start, pfn,
1393				 PAGE_SIZE, vma->vm_page_prot);
1394	if (err) {
1395		mlx5_ib_err(dev, "io_remap_pfn_range failed with error=%d, vm_start=0x%llx, pfn=%pa, mmap_cmd=%s\n",
1396			    err, (unsigned long long)vma->vm_start, &pfn, mmap_cmd2str(cmd));
1397		return -EAGAIN;
1398	}
1399
1400	pa = pfn << PAGE_SHIFT;
1401	mlx5_ib_dbg(dev, "mapped %s at 0x%llx, PA %pa\n", mmap_cmd2str(cmd),
1402		    (unsigned long long)vma->vm_start, &pa);
1403
1404	return mlx5_ib_set_vma_data(vma, context);
1405}
1406
1407static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
1408{
1409	struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
1410	struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
1411	unsigned long command;
1412	phys_addr_t pfn;
1413
1414	command = get_command(vma->vm_pgoff);
1415	switch (command) {
1416	case MLX5_IB_MMAP_WC_PAGE:
1417	case MLX5_IB_MMAP_NC_PAGE:
1418	case MLX5_IB_MMAP_REGULAR_PAGE:
1419		return uar_mmap(dev, command, vma, context);
1420
1421	case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
1422		return -ENOSYS;
1423
1424	case MLX5_IB_MMAP_CORE_CLOCK:
1425		if (vma->vm_end - vma->vm_start != PAGE_SIZE)
1426			return -EINVAL;
1427
1428		if (vma->vm_flags & VM_WRITE)
1429			return -EPERM;
1430
1431		/* Don't expose to user-space information it shouldn't have */
1432		if (PAGE_SIZE > 4096)
1433			return -EOPNOTSUPP;
1434
1435		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
1436		pfn = (dev->mdev->iseg_base +
1437		       offsetof(struct mlx5_init_seg, internal_timer_h)) >>
1438			PAGE_SHIFT;
1439		if (io_remap_pfn_range(vma, vma->vm_start, pfn,
1440				       PAGE_SIZE, vma->vm_page_prot))
1441			return -EAGAIN;
1442
1443		mlx5_ib_dbg(dev, "mapped internal timer at 0x%llx, PA 0x%llx\n",
1444			    (unsigned long long)vma->vm_start,
1445			    (unsigned long long)pfn << PAGE_SHIFT);
1446		break;
1447
1448	default:
1449		return -EINVAL;
1450	}
1451
1452	return 0;
1453}
1454
1455static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
1456				      struct ib_ucontext *context,
1457				      struct ib_udata *udata)
1458{
1459	struct mlx5_ib_alloc_pd_resp resp;
1460	struct mlx5_ib_pd *pd;
1461	int err;
1462
1463	pd = kmalloc(sizeof(*pd), GFP_KERNEL);
1464	if (!pd)
1465		return ERR_PTR(-ENOMEM);
1466
1467	err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn);
1468	if (err) {
1469		kfree(pd);
1470		return ERR_PTR(err);
1471	}
1472
1473	if (context) {
1474		resp.pdn = pd->pdn;
1475		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
1476			mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
1477			kfree(pd);
1478			return ERR_PTR(-EFAULT);
1479		}
1480	}
1481
1482	return &pd->ibpd;
1483}
1484
1485static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
1486{
1487	struct mlx5_ib_dev *mdev = to_mdev(pd->device);
1488	struct mlx5_ib_pd *mpd = to_mpd(pd);
1489
1490	mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn);
1491	kfree(mpd);
1492
1493	return 0;
1494}
1495
1496enum {
1497	MATCH_CRITERIA_ENABLE_OUTER_BIT,
1498	MATCH_CRITERIA_ENABLE_MISC_BIT,
1499	MATCH_CRITERIA_ENABLE_INNER_BIT
1500};
1501
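/*
 * Compute the match_criteria_enable bitmask for a flow table entry: a
 * criteria class (outer headers, misc parameters, inner headers) is enabled
 * only when its portion of the match criteria is non-zero.
 */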
1502#define HEADER_IS_ZERO(match_criteria, headers)			           \
1503	!(memchr_inv(MLX5_ADDR_OF(fte_match_param, match_criteria, headers), \
1504		    0, MLX5_FLD_SZ_BYTES(fte_match_param, headers)))       \
1505
1506static u8 get_match_criteria_enable(u32 *match_criteria)
1507{
1508	u8 match_criteria_enable;
1509
1510	match_criteria_enable =
1511		(!HEADER_IS_ZERO(match_criteria, outer_headers)) <<
1512		MATCH_CRITERIA_ENABLE_OUTER_BIT;
1513	match_criteria_enable |=
1514		(!HEADER_IS_ZERO(match_criteria, misc_parameters)) <<
1515		MATCH_CRITERIA_ENABLE_MISC_BIT;
1516	match_criteria_enable |=
1517		(!HEADER_IS_ZERO(match_criteria, inner_headers)) <<
1518		MATCH_CRITERIA_ENABLE_INNER_BIT;
1519
1520	return match_criteria_enable;
1521}
1522
1523static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
1524{
1525	MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
1526	MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
1527}
1528
1529static void set_tos(void *outer_c, void *outer_v, u8 mask, u8 val)
1530{
1531	MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_ecn, mask);
1532	MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_ecn, val);
1533	MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_dscp, mask >> 2);
1534	MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_dscp, val >> 2);
1535}
1536
1537#define LAST_ETH_FIELD vlan_tag
1538#define LAST_IB_FIELD sl
1539#define LAST_IPV4_FIELD tos
1540#define LAST_IPV6_FIELD traffic_class
1541#define LAST_TCP_UDP_FIELD src_port
1542
1543/* Field is the last supported field */
1544#define FIELDS_NOT_SUPPORTED(filter, field)\
1545	memchr_inv((void *)&filter.field  +\
1546		   sizeof(filter.field), 0,\
1547		   sizeof(filter) -\
1548		   offsetof(typeof(filter), field) -\
1549		   sizeof(filter.field))
1550
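/*
 * Translate a single ib_flow_spec (ETH, IPV4, IPV6, TCP or UDP) into the
 * firmware fte_match_param mask/value pair.  Specs carrying fields beyond
 * the last supported one are rejected with -ENOTSUPP.
 */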
1551static int parse_flow_attr(u32 *match_c, u32 *match_v,
1552			   const union ib_flow_spec *ib_spec)
1553{
1554	void *outer_headers_c = MLX5_ADDR_OF(fte_match_param, match_c,
1555					     outer_headers);
1556	void *outer_headers_v = MLX5_ADDR_OF(fte_match_param, match_v,
1557					     outer_headers);
1558	void *misc_params_c = MLX5_ADDR_OF(fte_match_param, match_c,
1559					   misc_parameters);
1560	void *misc_params_v = MLX5_ADDR_OF(fte_match_param, match_v,
1561					   misc_parameters);
1562
1563	switch (ib_spec->type) {
1564	case IB_FLOW_SPEC_ETH:
1565		if (FIELDS_NOT_SUPPORTED(ib_spec->eth.mask, LAST_ETH_FIELD))
1566			return -ENOTSUPP;
1567
1568		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1569					     dmac_47_16),
1570				ib_spec->eth.mask.dst_mac);
1571		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1572					     dmac_47_16),
1573				ib_spec->eth.val.dst_mac);
1574
1575		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1576					     smac_47_16),
1577				ib_spec->eth.mask.src_mac);
1578		ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1579					     smac_47_16),
1580				ib_spec->eth.val.src_mac);
1581
1582		if (ib_spec->eth.mask.vlan_tag) {
1583			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1584				 cvlan_tag, 1);
1585			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1586				 cvlan_tag, 1);
1587
1588			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1589				 first_vid, ntohs(ib_spec->eth.mask.vlan_tag));
1590			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1591				 first_vid, ntohs(ib_spec->eth.val.vlan_tag));
1592
1593			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1594				 first_cfi,
1595				 ntohs(ib_spec->eth.mask.vlan_tag) >> 12);
1596			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1597				 first_cfi,
1598				 ntohs(ib_spec->eth.val.vlan_tag) >> 12);
1599
1600			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1601				 first_prio,
1602				 ntohs(ib_spec->eth.mask.vlan_tag) >> 13);
1603			MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1604				 first_prio,
1605				 ntohs(ib_spec->eth.val.vlan_tag) >> 13);
1606		}
1607		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1608			 ethertype, ntohs(ib_spec->eth.mask.ether_type));
1609		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1610			 ethertype, ntohs(ib_spec->eth.val.ether_type));
1611		break;
1612	case IB_FLOW_SPEC_IPV4:
1613		if (FIELDS_NOT_SUPPORTED(ib_spec->ipv4.mask, LAST_IPV4_FIELD))
1614			return -ENOTSUPP;
1615
1616		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1617			 ethertype, 0xffff);
1618		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1619			 ethertype, ETH_P_IP);
1620
1621		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1622				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
1623		       &ib_spec->ipv4.mask.src_ip,
1624		       sizeof(ib_spec->ipv4.mask.src_ip));
1625		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1626				    src_ipv4_src_ipv6.ipv4_layout.ipv4),
1627		       &ib_spec->ipv4.val.src_ip,
1628		       sizeof(ib_spec->ipv4.val.src_ip));
1629		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1630				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
1631		       &ib_spec->ipv4.mask.dst_ip,
1632		       sizeof(ib_spec->ipv4.mask.dst_ip));
1633		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1634				    dst_ipv4_dst_ipv6.ipv4_layout.ipv4),
1635		       &ib_spec->ipv4.val.dst_ip,
1636		       sizeof(ib_spec->ipv4.val.dst_ip));
1637
1638		set_tos(outer_headers_c, outer_headers_v,
1639			ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
1640
1641		set_proto(outer_headers_c, outer_headers_v,
1642			  ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto);
1643		break;
1644	case IB_FLOW_SPEC_IPV6:
1645		if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
1646			return -ENOTSUPP;
1647
1648		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
1649			 ethertype, 0xffff);
1650		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v,
1651			 ethertype, IPPROTO_IPV6);
1652
1653		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1654				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
1655		       &ib_spec->ipv6.mask.src_ip,
1656		       sizeof(ib_spec->ipv6.mask.src_ip));
1657		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1658				    src_ipv4_src_ipv6.ipv6_layout.ipv6),
1659		       &ib_spec->ipv6.val.src_ip,
1660		       sizeof(ib_spec->ipv6.val.src_ip));
1661		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
1662				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
1663		       &ib_spec->ipv6.mask.dst_ip,
1664		       sizeof(ib_spec->ipv6.mask.dst_ip));
1665		memcpy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
1666				    dst_ipv4_dst_ipv6.ipv6_layout.ipv6),
1667		       &ib_spec->ipv6.val.dst_ip,
1668		       sizeof(ib_spec->ipv6.val.dst_ip));
1669
1670		set_tos(outer_headers_c, outer_headers_v,
1671			ib_spec->ipv6.mask.traffic_class,
1672			ib_spec->ipv6.val.traffic_class);
1673
1674		set_proto(outer_headers_c, outer_headers_v,
1675			  ib_spec->ipv6.mask.next_hdr,
1676			  ib_spec->ipv6.val.next_hdr);
1677
1678		MLX5_SET(fte_match_set_misc, misc_params_c,
1679			 outer_ipv6_flow_label,
1680			 ntohl(ib_spec->ipv6.mask.flow_label));
1681		MLX5_SET(fte_match_set_misc, misc_params_v,
1682			 outer_ipv6_flow_label,
1683			 ntohl(ib_spec->ipv6.val.flow_label));
1684		break;
1685	case IB_FLOW_SPEC_TCP:
1686		if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
1687					 LAST_TCP_UDP_FIELD))
1688			return -ENOTSUPP;
1689
1690		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol,
1691			 0xff);
1692		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol,
1693			 IPPROTO_TCP);
1694
1695		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_sport,
1696			 ntohs(ib_spec->tcp_udp.mask.src_port));
1697		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_sport,
1698			 ntohs(ib_spec->tcp_udp.val.src_port));
1699
1700		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, tcp_dport,
1701			 ntohs(ib_spec->tcp_udp.mask.dst_port));
1702		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, tcp_dport,
1703			 ntohs(ib_spec->tcp_udp.val.dst_port));
1704		break;
1705	case IB_FLOW_SPEC_UDP:
1706		if (FIELDS_NOT_SUPPORTED(ib_spec->tcp_udp.mask,
1707					 LAST_TCP_UDP_FIELD))
1708			return -ENOTSUPP;
1709
1710		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, ip_protocol,
1711			 0xff);
1712		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, ip_protocol,
1713			 IPPROTO_UDP);
1714
1715		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_sport,
1716			 ntohs(ib_spec->tcp_udp.mask.src_port));
1717		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_sport,
1718			 ntohs(ib_spec->tcp_udp.val.src_port));
1719
1720		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c, udp_dport,
1721			 ntohs(ib_spec->tcp_udp.mask.dst_port));
1722		MLX5_SET(fte_match_set_lyr_2_4, outer_headers_v, udp_dport,
1723			 ntohs(ib_spec->tcp_udp.val.dst_port));
1724		break;
1725	default:
1726		return -EINVAL;
1727	}
1728
1729	return 0;
1730}
1731
1732/* If a flow could catch both multicast and unicast packets,
1733 * it won't fall into the multicast flow steering table and this rule
1734 * could steal other multicast packets.
1735 */
1736static bool flow_is_multicast_only(struct ib_flow_attr *ib_attr)
1737{
1738	struct ib_flow_spec_eth *eth_spec;
1739
1740	if (ib_attr->type != IB_FLOW_ATTR_NORMAL ||
1741	    ib_attr->size < sizeof(struct ib_flow_attr) +
1742	    sizeof(struct ib_flow_spec_eth) ||
1743	    ib_attr->num_of_specs < 1)
1744		return false;
1745
1746	eth_spec = (struct ib_flow_spec_eth *)(ib_attr + 1);
1747	if (eth_spec->type != IB_FLOW_SPEC_ETH ||
1748	    eth_spec->size != sizeof(*eth_spec))
1749		return false;
1750
1751	return is_multicast_ether_addr(eth_spec->mask.dst_mac) &&
1752	       is_multicast_ether_addr(eth_spec->val.dst_mac);
1753}
1754
1755static bool is_valid_attr(const struct ib_flow_attr *flow_attr)
1756{
1757	union ib_flow_spec *ib_spec = (union ib_flow_spec *)(flow_attr + 1);
1758	bool has_ipv4_spec = false;
1759	bool eth_type_ipv4 = true;
1760	unsigned int spec_index;
1761
1762	/* Validate that the ethertype is consistent with any IPv4 spec */
1763	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
1764		if (ib_spec->type == IB_FLOW_SPEC_ETH &&
1765		    ib_spec->eth.mask.ether_type) {
1766			if (!((ib_spec->eth.mask.ether_type == htons(0xffff)) &&
1767			      ib_spec->eth.val.ether_type == htons(ETH_P_IP)))
1768				eth_type_ipv4 = false;
1769		} else if (ib_spec->type == IB_FLOW_SPEC_IPV4) {
1770			has_ipv4_spec = true;
1771		}
1772		ib_spec = (void *)ib_spec + ib_spec->size;
1773	}
1774	return !has_ipv4_spec || eth_type_ipv4;
1775}
1776
1777static void put_flow_table(struct mlx5_ib_dev *dev,
1778			   struct mlx5_ib_flow_prio *prio, bool ft_added)
1779{
1780	prio->refcount -= !!ft_added;
1781	if (!prio->refcount) {
1782		mlx5_destroy_flow_table(prio->flow_table);
1783		prio->flow_table = NULL;
1784	}
1785}
1786
1787static int mlx5_ib_destroy_flow(struct ib_flow *flow_id)
1788{
1789	struct mlx5_ib_dev *dev = to_mdev(flow_id->qp->device);
1790	struct mlx5_ib_flow_handler *handler = container_of(flow_id,
1791							  struct mlx5_ib_flow_handler,
1792							  ibflow);
1793	struct mlx5_ib_flow_handler *iter, *tmp;
1794
1795	mutex_lock(&dev->flow_db.lock);
1796
1797	list_for_each_entry_safe(iter, tmp, &handler->list, list) {
1798		mlx5_del_flow_rule(iter->rule);
1799		put_flow_table(dev, iter->prio, true);
1800		list_del(&iter->list);
1801		kfree(iter);
1802	}
1803
1804	mlx5_del_flow_rule(handler->rule);
1805	put_flow_table(dev, handler->prio, true);
1806	mutex_unlock(&dev->flow_db.lock);
1807
1808	kfree(handler);
1809
1810	return 0;
1811}
1812
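/*
 * Map a user-visible IB flow priority onto a core (firmware) priority.
 * Each IB priority occupies a pair of core priorities: don't-trap rules
 * take the even slot (2 * prio) and regular rules the odd slot
 * (2 * prio + 1), so a don't-trap rule never shares a core priority
 * with a normal rule of the same IB priority.
 */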
1813static int ib_prio_to_core_prio(unsigned int priority, bool dont_trap)
1814{
1815	priority *= 2;
1816	if (!dont_trap)
1817		priority++;
1818	return priority;
1819}
1820
1821enum flow_table_type {
1822	MLX5_IB_FT_RX,
1823	MLX5_IB_FT_TX
1824};
1825
1826#define MLX5_FS_MAX_TYPES	 10
1827#define MLX5_FS_MAX_ENTRIES	 32000UL
1828static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
1829						struct ib_flow_attr *flow_attr,
1830						enum flow_table_type ft_type)
1831{
1832	bool dont_trap = flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP;
1833	struct mlx5_flow_namespace *ns = NULL;
1834	struct mlx5_ib_flow_prio *prio;
1835	struct mlx5_flow_table *ft;
1836	int num_entries;
1837	int num_groups;
1838	int priority;
1839	int err = 0;
1840
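	/*
	 * Pick the flow namespace and priority slot from the attribute type:
	 * NORMAL rules go to the bypass namespace (multicast-only rules use
	 * the dedicated MCAST priority), ALL/MC_DEFAULT rules go to the
	 * leftovers namespace, and SNIFFER rules go to the RX or TX sniffer
	 * namespace selected by ft_type.
	 */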
1841	if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
1842		if (flow_is_multicast_only(flow_attr) &&
1843		    !dont_trap)
1844			priority = MLX5_IB_FLOW_MCAST_PRIO;
1845		else
1846			priority = ib_prio_to_core_prio(flow_attr->priority,
1847							dont_trap);
1848		ns = mlx5_get_flow_namespace(dev->mdev,
1849					     MLX5_FLOW_NAMESPACE_BYPASS);
1850		num_entries = MLX5_FS_MAX_ENTRIES;
1851		num_groups = MLX5_FS_MAX_TYPES;
1852		prio = &dev->flow_db.prios[priority];
1853	} else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
1854		   flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
1855		ns = mlx5_get_flow_namespace(dev->mdev,
1856					     MLX5_FLOW_NAMESPACE_LEFTOVERS);
1857		build_leftovers_ft_param("bypass", &priority,
1858					 &num_entries,
1859					 &num_groups);
1860		prio = &dev->flow_db.prios[MLX5_IB_FLOW_LEFTOVERS_PRIO];
1861	} else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
1862		if (!MLX5_CAP_FLOWTABLE(dev->mdev,
1863					allow_sniffer_and_nic_rx_shared_tir))
1864			return ERR_PTR(-ENOTSUPP);
1865
1866		ns = mlx5_get_flow_namespace(dev->mdev, ft_type == MLX5_IB_FT_RX ?
1867					     MLX5_FLOW_NAMESPACE_SNIFFER_RX :
1868					     MLX5_FLOW_NAMESPACE_SNIFFER_TX);
1869
1870		prio = &dev->flow_db.sniffer[ft_type];
1871		priority = 0;
1872		num_entries = 1;
1873		num_groups = 1;
1874	}
1875
1876	if (!ns)
1877		return ERR_PTR(-ENOTSUPP);
1878
1879	ft = prio->flow_table;
1880	if (!ft) {
1881		ft = mlx5_create_auto_grouped_flow_table(ns, priority, "bypass",
1882							 num_entries,
1883							 num_groups);
1884
1885		if (!IS_ERR(ft)) {
1886			prio->refcount = 0;
1887			prio->flow_table = ft;
1888		} else {
1889			err = PTR_ERR(ft);
1890		}
1891	}
1892
1893	return err ? ERR_PTR(err) : prio;
1894}
1895
1896static struct mlx5_ib_flow_handler *create_flow_rule(struct mlx5_ib_dev *dev,
1897						     struct mlx5_ib_flow_prio *ft_prio,
1898						     const struct ib_flow_attr *flow_attr,
1899						     struct mlx5_flow_destination *dst)
1900{
1901	struct mlx5_flow_table	*ft = ft_prio->flow_table;
1902	struct mlx5_ib_flow_handler *handler;
1903	struct mlx5_flow_spec *spec;
1904	const void *ib_flow = (const void *)flow_attr + sizeof(*flow_attr);
1905	unsigned int spec_index;
1906	u32 action;
1907	int err = 0;
1908
1909	if (!is_valid_attr(flow_attr))
1910		return ERR_PTR(-EINVAL);
1911
1912	spec = mlx5_vzalloc(sizeof(*spec));
1913	handler = kzalloc(sizeof(*handler), GFP_KERNEL);
1914	if (!handler || !spec) {
1915		err = -ENOMEM;
1916		goto free;
1917	}
1918
1919	INIT_LIST_HEAD(&handler->list);
1920
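	/*
	 * The ib_flow_attr header is followed by num_of_specs variable-size
	 * flow specs; parse each one into the match criteria/value buffers
	 * and advance by the size recorded in the spec itself.
	 */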
1921	for (spec_index = 0; spec_index < flow_attr->num_of_specs; spec_index++) {
1922		err = parse_flow_attr(spec->match_criteria,
1923				      spec->match_value, ib_flow);
1924		if (err < 0)
1925			goto free;
1926
1927		ib_flow += ((union ib_flow_spec *)ib_flow)->size;
1928	}
1929
1930	spec->match_criteria_enable = get_match_criteria_enable(spec->match_criteria);
1931	action = dst ? MLX5_FLOW_CONTEXT_ACTION_FWD_DEST :
1932		MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO;
1933	handler->rule = mlx5_add_flow_rule(ft, spec->match_criteria_enable,
1934					   spec->match_criteria,
1935					   spec->match_value,
1936					   action,
1937					   MLX5_FS_DEFAULT_FLOW_TAG,
1938					   dst);
1939
1940	if (IS_ERR(handler->rule)) {
1941		err = PTR_ERR(handler->rule);
1942		goto free;
1943	}
1944
1945	ft_prio->refcount++;
1946	handler->prio = ft_prio;
1947
1948	ft_prio->flow_table = ft;
1949free:
1950	if (err)
1951		kfree(handler);
1952	kvfree(spec);
1953	return err ? ERR_PTR(err) : handler;
1954}
1955
1956static struct mlx5_ib_flow_handler *create_dont_trap_rule(struct mlx5_ib_dev *dev,
1957							  struct mlx5_ib_flow_prio *ft_prio,
1958							  struct ib_flow_attr *flow_attr,
1959							  struct mlx5_flow_destination *dst)
1960{
1961	struct mlx5_ib_flow_handler *handler_dst = NULL;
1962	struct mlx5_ib_flow_handler *handler = NULL;
1963
1964	handler = create_flow_rule(dev, ft_prio, flow_attr, NULL);
1965	if (!IS_ERR(handler)) {
1966		handler_dst = create_flow_rule(dev, ft_prio,
1967					       flow_attr, dst);
1968		if (IS_ERR(handler_dst)) {
1969			mlx5_del_flow_rule(handler->rule);
1970			ft_prio->refcount--;
1971			kfree(handler);
1972			handler = handler_dst;
1973		} else {
1974			list_add(&handler_dst->list, &handler->list);
1975		}
1976	}
1977
1978	return handler;
1979}

1980enum {
1981	LEFTOVERS_MC,
1982	LEFTOVERS_UC,
1983};
1984
1985static struct mlx5_ib_flow_handler *create_leftovers_rule(struct mlx5_ib_dev *dev,
1986							  struct mlx5_ib_flow_prio *ft_prio,
1987							  struct ib_flow_attr *flow_attr,
1988							  struct mlx5_flow_destination *dst)
1989{
1990	struct mlx5_ib_flow_handler *handler_ucast = NULL;
1991	struct mlx5_ib_flow_handler *handler = NULL;
1992
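	/*
	 * Two template specs cover the leftovers table: the MC entry matches
	 * any destination MAC with the multicast bit set, while the UC entry
	 * uses the same mask with a zero value and therefore matches the
	 * remaining (unicast) traffic.
	 */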
1993	static struct {
1994		struct ib_flow_attr	flow_attr;
1995		struct ib_flow_spec_eth eth_flow;
1996	} leftovers_specs[] = {
1997		[LEFTOVERS_MC] = {
1998			.flow_attr = {
1999				.num_of_specs = 1,
2000				.size = sizeof(leftovers_specs[0])
2001			},
2002			.eth_flow = {
2003				.type = IB_FLOW_SPEC_ETH,
2004				.size = sizeof(struct ib_flow_spec_eth),
2005				.mask = {.dst_mac = {0x1} },
2006				.val =  {.dst_mac = {0x1} }
2007			}
2008		},
2009		[LEFTOVERS_UC] = {
2010			.flow_attr = {
2011				.num_of_specs = 1,
2012				.size = sizeof(leftovers_specs[0])
2013			},
2014			.eth_flow = {
2015				.type = IB_FLOW_SPEC_ETH,
2016				.size = sizeof(struct ib_flow_spec_eth),
2017				.mask = {.dst_mac = {0x1} },
2018				.val = {.dst_mac = {} }
2019			}
2020		}
2021	};
2022
2023	handler = create_flow_rule(dev, ft_prio,
2024				   &leftovers_specs[LEFTOVERS_MC].flow_attr,
2025				   dst);
2026	if (!IS_ERR(handler) &&
2027	    flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT) {
2028		handler_ucast = create_flow_rule(dev, ft_prio,
2029						 &leftovers_specs[LEFTOVERS_UC].flow_attr,
2030						 dst);
2031		if (IS_ERR(handler_ucast)) {
2032			mlx5_del_flow_rule(handler->rule);
2033			ft_prio->refcount--;
2034			kfree(handler);
2035			handler = handler_ucast;
2036		} else {
2037			list_add(&handler_ucast->list, &handler->list);
2038		}
2039	}
2040
2041	return handler;
2042}
2043
2044static struct mlx5_ib_flow_handler *create_sniffer_rule(struct mlx5_ib_dev *dev,
2045							struct mlx5_ib_flow_prio *ft_rx,
2046							struct mlx5_ib_flow_prio *ft_tx,
2047							struct mlx5_flow_destination *dst)
2048{
2049	struct mlx5_ib_flow_handler *handler_rx;
2050	struct mlx5_ib_flow_handler *handler_tx;
2051	int err;
2052	static const struct ib_flow_attr flow_attr  = {
2053		.num_of_specs = 0,
2054		.size = sizeof(flow_attr)
2055	};
2056
2057	handler_rx = create_flow_rule(dev, ft_rx, &flow_attr, dst);
2058	if (IS_ERR(handler_rx)) {
2059		err = PTR_ERR(handler_rx);
2060		goto err;
2061	}
2062
2063	handler_tx = create_flow_rule(dev, ft_tx, &flow_attr, dst);
2064	if (IS_ERR(handler_tx)) {
2065		err = PTR_ERR(handler_tx);
2066		goto err_tx;
2067	}
2068
2069	list_add(&handler_tx->list, &handler_rx->list);
2070
2071	return handler_rx;
2072
2073err_tx:
2074	mlx5_del_flow_rule(handler_rx->rule);
2075	ft_rx->refcount--;
2076	kfree(handler_rx);
2077err:
2078	return ERR_PTR(err);
2079}
2080
2081static struct ib_flow *mlx5_ib_create_flow(struct ib_qp *qp,
2082					   struct ib_flow_attr *flow_attr,
2083					   int domain)
2084{
2085	struct mlx5_ib_dev *dev = to_mdev(qp->device);
2086	struct mlx5_ib_qp *mqp = to_mqp(qp);
2087	struct mlx5_ib_flow_handler *handler = NULL;
2088	struct mlx5_flow_destination *dst = NULL;
2089	struct mlx5_ib_flow_prio *ft_prio_tx = NULL;
2090	struct mlx5_ib_flow_prio *ft_prio;
2091	int err;
2092
2093	if (flow_attr->priority > MLX5_IB_FLOW_LAST_PRIO)
2094		return ERR_PTR(-ENOSPC);
2095
2096	if (domain != IB_FLOW_DOMAIN_USER ||
2097	    flow_attr->port > MLX5_CAP_GEN(dev->mdev, num_ports) ||
2098	    (flow_attr->flags & ~IB_FLOW_ATTR_FLAGS_DONT_TRAP))
2099		return ERR_PTR(-EINVAL);
2100
2101	dst = kzalloc(sizeof(*dst), GFP_KERNEL);
2102	if (!dst)
2103		return ERR_PTR(-ENOMEM);
2104
2105	mutex_lock(&dev->flow_db.lock);
2106
2107	ft_prio = get_flow_table(dev, flow_attr, MLX5_IB_FT_RX);
2108	if (IS_ERR(ft_prio)) {
2109		err = PTR_ERR(ft_prio);
2110		goto unlock;
2111	}
2112	if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
2113		ft_prio_tx = get_flow_table(dev, flow_attr, MLX5_IB_FT_TX);
2114		if (IS_ERR(ft_prio_tx)) {
2115			err = PTR_ERR(ft_prio_tx);
2116			ft_prio_tx = NULL;
2117			goto destroy_ft;
2118		}
2119	}
2120
2121	dst->type = MLX5_FLOW_DESTINATION_TYPE_TIR;
2122	if (mqp->flags & MLX5_IB_QP_RSS)
2123		dst->tir_num = mqp->rss_qp.tirn;
2124	else
2125		dst->tir_num = mqp->raw_packet_qp.rq.tirn;
2126
2127	if (flow_attr->type == IB_FLOW_ATTR_NORMAL) {
2128		if (flow_attr->flags & IB_FLOW_ATTR_FLAGS_DONT_TRAP)  {
2129			handler = create_dont_trap_rule(dev, ft_prio,
2130							flow_attr, dst);
2131		} else {
2132			handler = create_flow_rule(dev, ft_prio, flow_attr,
2133						   dst);
2134		}
2135	} else if (flow_attr->type == IB_FLOW_ATTR_ALL_DEFAULT ||
2136		   flow_attr->type == IB_FLOW_ATTR_MC_DEFAULT) {
2137		handler = create_leftovers_rule(dev, ft_prio, flow_attr,
2138						dst);
2139	} else if (flow_attr->type == IB_FLOW_ATTR_SNIFFER) {
2140		handler = create_sniffer_rule(dev, ft_prio, ft_prio_tx, dst);
2141	} else {
2142		err = -EINVAL;
2143		goto destroy_ft;
2144	}
2145
2146	if (IS_ERR(handler)) {
2147		err = PTR_ERR(handler);
2148		handler = NULL;
2149		goto destroy_ft;
2150	}
2151
2152	mutex_unlock(&dev->flow_db.lock);
2153	kfree(dst);
2154
2155	return &handler->ibflow;
2156
2157destroy_ft:
2158	put_flow_table(dev, ft_prio, false);
2159	if (ft_prio_tx)
2160		put_flow_table(dev, ft_prio_tx, false);
2161unlock:
2162	mutex_unlock(&dev->flow_db.lock);
2163	kfree(dst);
2164	kfree(handler);
2165	return ERR_PTR(err);
2166}
2167
2168static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
2169{
2170	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2171	int err;
2172
2173	err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num);
2174	if (err)
2175		mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
2176			     ibqp->qp_num, gid->raw);
2177
2178	return err;
2179}
2180
2181static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
2182{
2183	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2184	int err;
2185
2186	err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num);
2187	if (err)
2188		mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
2189			     ibqp->qp_num, gid->raw);
2190
2191	return err;
2192}
2193
2194static int init_node_data(struct mlx5_ib_dev *dev)
2195{
2196	int err;
2197
2198	err = mlx5_query_node_desc(dev, dev->ib_dev.node_desc);
2199	if (err)
2200		return err;
2201
2202	return mlx5_query_node_guid(dev, &dev->ib_dev.node_guid);
2203}
2204
2205static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr,
2206			     char *buf)
2207{
2208	struct mlx5_ib_dev *dev =
2209		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2210
2211	return sprintf(buf, "%lld\n", (long long)dev->mdev->priv.fw_pages);
2212}
2213
2214static ssize_t show_reg_pages(struct device *device,
2215			      struct device_attribute *attr, char *buf)
2216{
2217	struct mlx5_ib_dev *dev =
2218		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2219
2220	return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
2221}
2222
2223static ssize_t show_hca(struct device *device, struct device_attribute *attr,
2224			char *buf)
2225{
2226	struct mlx5_ib_dev *dev =
2227		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2228	return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
2229}
2230
2231static ssize_t show_rev(struct device *device, struct device_attribute *attr,
2232			char *buf)
2233{
2234	struct mlx5_ib_dev *dev =
2235		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2236	return sprintf(buf, "%x\n", dev->mdev->pdev->revision);
2237}
2238
2239static ssize_t show_board(struct device *device, struct device_attribute *attr,
2240			  char *buf)
2241{
2242	struct mlx5_ib_dev *dev =
2243		container_of(device, struct mlx5_ib_dev, ib_dev.dev);
2244	return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
2245		       dev->mdev->board_id);
2246}
2247
2248static DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
2249static DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
2250static DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
2251static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
2252static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
2253
2254static struct device_attribute *mlx5_class_attributes[] = {
2255	&dev_attr_hw_rev,
2256	&dev_attr_hca_type,
2257	&dev_attr_board_id,
2258	&dev_attr_fw_pages,
2259	&dev_attr_reg_pages,
2260};
2261
2262static void pkey_change_handler(struct work_struct *work)
2263{
2264	struct mlx5_ib_port_resources *ports =
2265		container_of(work, struct mlx5_ib_port_resources,
2266			     pkey_change_work);
2267
2268	mutex_lock(&ports->devr->mutex);
2269	mlx5_ib_gsi_pkey_change(ports->gsi);
2270	mutex_unlock(&ports->devr->mutex);
2271}
2272
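/*
 * On a fatal device error, walk every QP on this ibdev and collect the
 * send/receive CQs that still have outstanding work, then invoke their
 * completion handlers so consumers can reap the flushed completions.
 */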
2273static void mlx5_ib_handle_internal_error(struct mlx5_ib_dev *ibdev)
2274{
2275	struct mlx5_ib_qp *mqp;
2276	struct mlx5_ib_cq *send_mcq, *recv_mcq;
2277	struct mlx5_core_cq *mcq;
2278	struct list_head cq_armed_list;
2279	unsigned long flags_qp;
2280	unsigned long flags_cq;
2281	unsigned long flags;
2282
2283	INIT_LIST_HEAD(&cq_armed_list);
2284
2285	/* Go over the QP list on this ibdev, synced with QP create/destroy. */
2286	spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
2287	list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
2288		spin_lock_irqsave(&mqp->sq.lock, flags_qp);
2289		if (mqp->sq.tail != mqp->sq.head) {
2290			send_mcq = to_mcq(mqp->ibqp.send_cq);
2291			spin_lock_irqsave(&send_mcq->lock, flags_cq);
2292			if (send_mcq->mcq.comp &&
2293			    mqp->ibqp.send_cq->comp_handler) {
2294				if (!send_mcq->mcq.reset_notify_added) {
2295					send_mcq->mcq.reset_notify_added = 1;
2296					list_add_tail(&send_mcq->mcq.reset_notify,
2297						      &cq_armed_list);
2298				}
2299			}
2300			spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
2301		}
2302		spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
2303		spin_lock_irqsave(&mqp->rq.lock, flags_qp);
2304		/* no handling is needed for SRQ */
2305		if (!mqp->ibqp.srq) {
2306			if (mqp->rq.tail != mqp->rq.head) {
2307				recv_mcq = to_mcq(mqp->ibqp.recv_cq);
2308				spin_lock_irqsave(&recv_mcq->lock, flags_cq);
2309				if (recv_mcq->mcq.comp &&
2310				    mqp->ibqp.recv_cq->comp_handler) {
2311					if (!recv_mcq->mcq.reset_notify_added) {
2312						recv_mcq->mcq.reset_notify_added = 1;
2313						list_add_tail(&recv_mcq->mcq.reset_notify,
2314							      &cq_armed_list);
2315					}
2316				}
2317				spin_unlock_irqrestore(&recv_mcq->lock,
2318						       flags_cq);
2319			}
2320		}
2321		spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
2322	}
2323	/* At this point all in-flight post-send requests have been observed,
2324	 * thanks to the lock/unlock sequence above.  Now arm all involved CQs.
2325	 */
2326	list_for_each_entry(mcq, &cq_armed_list, reset_notify) {
2327		mcq->comp(mcq);
2328	}
2329	spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
2330}
2331
2332static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
2333			  enum mlx5_dev_event event, unsigned long param)
2334{
2335	struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context;
2336	struct ib_event ibev;
2337	bool fatal = false;
2338	u8 port = (u8)param;
2339
2340	switch (event) {
2341	case MLX5_DEV_EVENT_SYS_ERROR:
2342		ibev.event = IB_EVENT_DEVICE_FATAL;
2343		mlx5_ib_handle_internal_error(ibdev);
2344		fatal = true;
2345		break;
2346
2347	case MLX5_DEV_EVENT_PORT_UP:
2348	case MLX5_DEV_EVENT_PORT_DOWN:
2349	case MLX5_DEV_EVENT_PORT_INITIALIZED:
2350		/* In RoCE, port up/down events are handled in
2351		 * mlx5_netdev_event().
2352		 */
2353		if (mlx5_ib_port_link_layer(&ibdev->ib_dev, port) ==
2354			IB_LINK_LAYER_ETHERNET)
2355			return;
2356
2357		ibev.event = (event == MLX5_DEV_EVENT_PORT_UP) ?
2358			     IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
2359		break;
2360
2361	case MLX5_DEV_EVENT_LID_CHANGE:
2362		ibev.event = IB_EVENT_LID_CHANGE;
2363		break;
2364
2365	case MLX5_DEV_EVENT_PKEY_CHANGE:
2366		ibev.event = IB_EVENT_PKEY_CHANGE;
2367
2368		schedule_work(&ibdev->devr.ports[port - 1].pkey_change_work);
2369		break;
2370
2371	case MLX5_DEV_EVENT_GUID_CHANGE:
2372		ibev.event = IB_EVENT_GID_CHANGE;
2373		break;
2374
2375	case MLX5_DEV_EVENT_CLIENT_REREG:
2376		ibev.event = IB_EVENT_CLIENT_REREGISTER;
2377		break;
2378
2379	default:
2380		/* unsupported event */
2381		return;
2382	}
2383
2384	ibev.device	      = &ibdev->ib_dev;
2385	ibev.element.port_num = port;
2386
2387	if (!rdma_is_port_valid(&ibdev->ib_dev, port)) {
2388		mlx5_ib_warn(ibdev, "warning: event(%d) on port %d\n", event, port);
2389		return;
2390	}
2391
2392	if (ibdev->ib_active)
2393		ib_dispatch_event(&ibev);
2394
2395	if (fatal)
2396		ibdev->ib_active = false;
2397}
2398
2399static void get_ext_port_caps(struct mlx5_ib_dev *dev)
2400{
2401	int port;
2402
2403	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++)
2404		mlx5_query_ext_port_caps(dev, port);
2405}
2406
2407static int get_port_caps(struct mlx5_ib_dev *dev)
2408{
2409	struct ib_device_attr *dprops = NULL;
2410	struct ib_port_attr *pprops = NULL;
2411	int err = -ENOMEM;
2412	int port;
2413	struct ib_udata uhw = {.inlen = 0, .outlen = 0};
2414
2415	pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
2416	if (!pprops)
2417		goto out;
2418
2419	dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
2420	if (!dprops)
2421		goto out;
2422
2423	err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw);
2424	if (err) {
2425		mlx5_ib_warn(dev, "query_device failed %d\n", err);
2426		goto out;
2427	}
2428
2429	for (port = 1; port <= MLX5_CAP_GEN(dev->mdev, num_ports); port++) {
2430		err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
2431		if (err) {
2432			mlx5_ib_warn(dev, "query_port %d failed %d\n",
2433				     port, err);
2434			break;
2435		}
2436		dev->mdev->port_caps[port - 1].pkey_table_len =
2437						dprops->max_pkeys;
2438		dev->mdev->port_caps[port - 1].gid_table_len =
2439						pprops->gid_tbl_len;
2440		mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n",
2441			    dprops->max_pkeys, pprops->gid_tbl_len);
2442	}
2443
2444out:
2445	kfree(pprops);
2446	kfree(dprops);
2447
2448	return err;
2449}
2450
2451static void destroy_umrc_res(struct mlx5_ib_dev *dev)
2452{
2453	int err;
2454
2455	err = mlx5_mr_cache_cleanup(dev);
2456	if (err)
2457		mlx5_ib_warn(dev, "mr cache cleanup failed\n");
2458
2459	mlx5_ib_destroy_qp(dev->umrc.qp);
2460	ib_free_cq(dev->umrc.cq);
2461	ib_dealloc_pd(dev->umrc.pd);
2462}
2463
2464enum {
2465	MAX_UMR_WR = 128,
2466};
2467
2468static int create_umr_res(struct mlx5_ib_dev *dev)
2469{
2470	struct ib_qp_init_attr *init_attr = NULL;
2471	struct ib_qp_attr *attr = NULL;
2472	struct ib_pd *pd;
2473	struct ib_cq *cq;
2474	struct ib_qp *qp;
2475	int ret;
2476
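	/*
	 * Set up the kernel-owned resources used for UMR (user memory
	 * registration) work requests: a PD, a CQ and a MLX5_IB_QPT_REG_UMR
	 * QP, which is moved through INIT -> RTR -> RTS below.
	 */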
2477	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
2478	init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
2479	if (!attr || !init_attr) {
2480		ret = -ENOMEM;
2481		goto error_0;
2482	}
2483
2484	pd = ib_alloc_pd(&dev->ib_dev, 0);
2485	if (IS_ERR(pd)) {
2486		mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
2487		ret = PTR_ERR(pd);
2488		goto error_0;
2489	}
2490
2491	cq = ib_alloc_cq(&dev->ib_dev, NULL, 128, 0, IB_POLL_SOFTIRQ);
2492	if (IS_ERR(cq)) {
2493		mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
2494		ret = PTR_ERR(cq);
2495		goto error_2;
2496	}
2497
2498	init_attr->send_cq = cq;
2499	init_attr->recv_cq = cq;
2500	init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
2501	init_attr->cap.max_send_wr = MAX_UMR_WR;
2502	init_attr->cap.max_send_sge = 1;
2503	init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
2504	init_attr->port_num = 1;
2505	qp = mlx5_ib_create_qp(pd, init_attr, NULL);
2506	if (IS_ERR(qp)) {
2507		mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
2508		ret = PTR_ERR(qp);
2509		goto error_3;
2510	}
2511	qp->device     = &dev->ib_dev;
2512	qp->real_qp    = qp;
2513	qp->uobject    = NULL;
2514	qp->qp_type    = MLX5_IB_QPT_REG_UMR;
2515
2516	attr->qp_state = IB_QPS_INIT;
2517	attr->port_num = 1;
2518	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
2519				IB_QP_PORT, NULL);
2520	if (ret) {
2521		mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
2522		goto error_4;
2523	}
2524
2525	memset(attr, 0, sizeof(*attr));
2526	attr->qp_state = IB_QPS_RTR;
2527	attr->path_mtu = IB_MTU_256;
2528
2529	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
2530	if (ret) {
2531		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
2532		goto error_4;
2533	}
2534
2535	memset(attr, 0, sizeof(*attr));
2536	attr->qp_state = IB_QPS_RTS;
2537	ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
2538	if (ret) {
2539		mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
2540		goto error_4;
2541	}
2542
2543	dev->umrc.qp = qp;
2544	dev->umrc.cq = cq;
2545	dev->umrc.pd = pd;
2546
2547	sema_init(&dev->umrc.sem, MAX_UMR_WR);
2548	ret = mlx5_mr_cache_init(dev);
2549	if (ret) {
2550		mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
2551		goto error_4;
2552	}
2553
2554	kfree(attr);
2555	kfree(init_attr);
2556
2557	return 0;
2558
2559error_4:
2560	mlx5_ib_destroy_qp(qp);
2561
2562error_3:
2563	ib_free_cq(cq);
2564
2565error_2:
2566	ib_dealloc_pd(pd);
2567
2568error_0:
2569	kfree(attr);
2570	kfree(init_attr);
2571	return ret;
2572}
2573
2574static int create_dev_resources(struct mlx5_ib_resources *devr)
2575{
2576	struct ib_srq_init_attr attr;
2577	struct mlx5_ib_dev *dev;
2578	struct ib_cq_init_attr cq_attr = {.cqe = 1};
2579	int port;
2580	int ret = 0;
2581
2582	dev = container_of(devr, struct mlx5_ib_dev, devr);
2583
2584	mutex_init(&devr->mutex);
2585
2586	devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
2587	if (IS_ERR(devr->p0)) {
2588		ret = PTR_ERR(devr->p0);
2589		goto error0;
2590	}
2591	devr->p0->device  = &dev->ib_dev;
2592	devr->p0->uobject = NULL;
2593	atomic_set(&devr->p0->usecnt, 0);
2594
2595	devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL);
2596	if (IS_ERR(devr->c0)) {
2597		ret = PTR_ERR(devr->c0);
2598		goto error1;
2599	}
2600	devr->c0->device        = &dev->ib_dev;
2601	devr->c0->uobject       = NULL;
2602	devr->c0->comp_handler  = NULL;
2603	devr->c0->event_handler = NULL;
2604	devr->c0->cq_context    = NULL;
2605	atomic_set(&devr->c0->usecnt, 0);
2606
2607	devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
2608	if (IS_ERR(devr->x0)) {
2609		ret = PTR_ERR(devr->x0);
2610		goto error2;
2611	}
2612	devr->x0->device = &dev->ib_dev;
2613	devr->x0->inode = NULL;
2614	atomic_set(&devr->x0->usecnt, 0);
2615	mutex_init(&devr->x0->tgt_qp_mutex);
2616	INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
2617
2618	devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
2619	if (IS_ERR(devr->x1)) {
2620		ret = PTR_ERR(devr->x1);
2621		goto error3;
2622	}
2623	devr->x1->device = &dev->ib_dev;
2624	devr->x1->inode = NULL;
2625	atomic_set(&devr->x1->usecnt, 0);
2626	mutex_init(&devr->x1->tgt_qp_mutex);
2627	INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
2628
2629	memset(&attr, 0, sizeof(attr));
2630	attr.attr.max_sge = 1;
2631	attr.attr.max_wr = 1;
2632	attr.srq_type = IB_SRQT_XRC;
2633	attr.ext.xrc.cq = devr->c0;
2634	attr.ext.xrc.xrcd = devr->x0;
2635
2636	devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
2637	if (IS_ERR(devr->s0)) {
2638		ret = PTR_ERR(devr->s0);
2639		goto error4;
2640	}
2641	devr->s0->device	= &dev->ib_dev;
2642	devr->s0->pd		= devr->p0;
2643	devr->s0->uobject       = NULL;
2644	devr->s0->event_handler = NULL;
2645	devr->s0->srq_context   = NULL;
2646	devr->s0->srq_type      = IB_SRQT_XRC;
2647	devr->s0->ext.xrc.xrcd	= devr->x0;
2648	devr->s0->ext.xrc.cq	= devr->c0;
2649	atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
2650	atomic_inc(&devr->s0->ext.xrc.cq->usecnt);
2651	atomic_inc(&devr->p0->usecnt);
2652	atomic_set(&devr->s0->usecnt, 0);
2653
2654	memset(&attr, 0, sizeof(attr));
2655	attr.attr.max_sge = 1;
2656	attr.attr.max_wr = 1;
2657	attr.srq_type = IB_SRQT_BASIC;
2658	devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
2659	if (IS_ERR(devr->s1)) {
2660		ret = PTR_ERR(devr->s1);
2661		goto error5;
2662	}
2663	devr->s1->device	= &dev->ib_dev;
2664	devr->s1->pd		= devr->p0;
2665	devr->s1->uobject       = NULL;
2666	devr->s1->event_handler = NULL;
2667	devr->s1->srq_context   = NULL;
2668	devr->s1->srq_type      = IB_SRQT_BASIC;
2669	devr->s1->ext.xrc.cq	= devr->c0;
2670	atomic_inc(&devr->p0->usecnt);
2671	atomic_set(&devr->s1->usecnt, 0);
2672
2673	for (port = 0; port < ARRAY_SIZE(devr->ports); ++port) {
2674		INIT_WORK(&devr->ports[port].pkey_change_work,
2675			  pkey_change_handler);
2676		devr->ports[port].devr = devr;
2677	}
2678
2679	return 0;
2680
2681error5:
2682	mlx5_ib_destroy_srq(devr->s0);
2683error4:
2684	mlx5_ib_dealloc_xrcd(devr->x1);
2685error3:
2686	mlx5_ib_dealloc_xrcd(devr->x0);
2687error2:
2688	mlx5_ib_destroy_cq(devr->c0);
2689error1:
2690	mlx5_ib_dealloc_pd(devr->p0);
2691error0:
2692	return ret;
2693}
2694
2695static void destroy_dev_resources(struct mlx5_ib_resources *devr)
2696{
2697	struct mlx5_ib_dev *dev =
2698		container_of(devr, struct mlx5_ib_dev, devr);
2699	int port;
2700
2701	mlx5_ib_destroy_srq(devr->s1);
2702	mlx5_ib_destroy_srq(devr->s0);
2703	mlx5_ib_dealloc_xrcd(devr->x0);
2704	mlx5_ib_dealloc_xrcd(devr->x1);
2705	mlx5_ib_destroy_cq(devr->c0);
2706	mlx5_ib_dealloc_pd(devr->p0);
2707
2708	/* Make sure no P_Key change work items are still executing */
2709	for (port = 0; port < dev->num_ports; ++port)
2710		cancel_work_sync(&devr->ports[port].pkey_change_work);
2711}
2712
2713static u32 get_core_cap_flags(struct ib_device *ibdev)
2714{
2715	struct mlx5_ib_dev *dev = to_mdev(ibdev);
2716	enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1);
2717	u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type);
2718	u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version);
2719	u32 ret = 0;
2720
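	/*
	 * InfiniBand ports advertise plain IB capabilities.  For Ethernet
	 * ports, RoCE is reported only when both IPv4 and IPv6 L3 types are
	 * supported; the RoCE v1/v2 bits are then taken from roce_version.
	 */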
2721	if (ll == IB_LINK_LAYER_INFINIBAND)
2722		return RDMA_CORE_PORT_IBA_IB;
2723
2724	if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP))
2725		return 0;
2726
2727	if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP))
2728		return 0;
2729
2730	if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP)
2731		ret |= RDMA_CORE_PORT_IBA_ROCE;
2732
2733	if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP)
2734		ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP;
2735
2736	return ret;
2737}
2738
2739static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num,
2740			       struct ib_port_immutable *immutable)
2741{
2742	struct ib_port_attr attr;
2743	int err;
2744
2745	err = mlx5_ib_query_port(ibdev, port_num, &attr);
2746	if (err)
2747		return err;
2748
2749	immutable->pkey_tbl_len = attr.pkey_tbl_len;
2750	immutable->gid_tbl_len = attr.gid_tbl_len;
2751	immutable->core_cap_flags = get_core_cap_flags(ibdev);
2752	immutable->max_mad_size = IB_MGMT_MAD_SIZE;
2753
2754	return 0;
2755}
2756
2757static void get_dev_fw_str(struct ib_device *ibdev, char *str,
2758			   size_t str_len)
2759{
2760	struct mlx5_ib_dev *dev =
2761		container_of(ibdev, struct mlx5_ib_dev, ib_dev);
2762	snprintf(str, str_len, "%d.%d.%04d", fw_rev_maj(dev->mdev),
2763		       fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
2764}
2765
2766static int mlx5_roce_lag_init(struct mlx5_ib_dev *dev)
2767{
2768	return 0;
2769}
2770
2771static void mlx5_roce_lag_cleanup(struct mlx5_ib_dev *dev)
2772{
2773}
2774
2775static void mlx5_remove_roce_notifier(struct mlx5_ib_dev *dev)
2776{
2777	if (dev->roce.nb.notifier_call) {
2778		unregister_netdevice_notifier(&dev->roce.nb);
2779		dev->roce.nb.notifier_call = NULL;
2780	}
2781}
2782
2783static int mlx5_enable_roce(struct mlx5_ib_dev *dev)
2784{
2785	VNET_ITERATOR_DECL(vnet_iter);
2786	struct net_device *idev;
2787	int err;
2788
2789	/* Check if mlx5en net device already exists */
2790	VNET_LIST_RLOCK();
2791	VNET_FOREACH(vnet_iter) {
2792		IFNET_RLOCK();
2793		CURVNET_SET_QUIET(vnet_iter);
2794		TAILQ_FOREACH(idev, &V_ifnet, if_link) {
2795			/* check if network interface belongs to mlx5en */
2796			if (!mlx5_netdev_match(idev, dev->mdev, "mce"))
2797				continue;
2798			write_lock(&dev->roce.netdev_lock);
2799			dev->roce.netdev = idev;
2800			write_unlock(&dev->roce.netdev_lock);
2801		}
2802		CURVNET_RESTORE();
2803		IFNET_RUNLOCK();
2804	}
2805	VNET_LIST_RUNLOCK();
2806
2807	dev->roce.nb.notifier_call = mlx5_netdev_event;
2808	err = register_netdevice_notifier(&dev->roce.nb);
2809	if (err) {
2810		dev->roce.nb.notifier_call = NULL;
2811		return err;
2812	}
2813
2814	err = mlx5_nic_vport_enable_roce(dev->mdev);
2815	if (err)
2816		goto err_unregister_netdevice_notifier;
2817
2818	err = mlx5_roce_lag_init(dev);
2819	if (err)
2820		goto err_disable_roce;
2821
2822	return 0;
2823
2824err_disable_roce:
2825	mlx5_nic_vport_disable_roce(dev->mdev);
2826
2827err_unregister_netdevice_notifier:
2828	mlx5_remove_roce_notifier(dev);
2829	return err;
2830}
2831
2832static void mlx5_disable_roce(struct mlx5_ib_dev *dev)
2833{
2834	mlx5_roce_lag_cleanup(dev);
2835	mlx5_nic_vport_disable_roce(dev->mdev);
2836}
2837
2838static void mlx5_ib_dealloc_q_port_counter(struct mlx5_ib_dev *dev, u8 port_num)
2839{
2840	mlx5_vport_dealloc_q_counter(dev->mdev,
2841				     MLX5_INTERFACE_PROTOCOL_IB,
2842				     dev->port[port_num].q_cnt_id);
2843	dev->port[port_num].q_cnt_id = 0;
2844}
2845
2846static void mlx5_ib_dealloc_q_counters(struct mlx5_ib_dev *dev)
2847{
2848	unsigned int i;
2849
2850	for (i = 0; i < dev->num_ports; i++)
2851		mlx5_ib_dealloc_q_port_counter(dev, i);
2852}
2853
2854static int mlx5_ib_alloc_q_counters(struct mlx5_ib_dev *dev)
2855{
2856	int i;
2857	int ret;
2858
2859	for (i = 0; i < dev->num_ports; i++) {
2860		ret = mlx5_vport_alloc_q_counter(dev->mdev,
2861						 MLX5_INTERFACE_PROTOCOL_IB,
2862						 &dev->port[i].q_cnt_id);
2863		if (ret) {
2864			mlx5_ib_warn(dev,
2865				     "couldn't allocate queue counter for port %d, err %d\n",
2866				     i + 1, ret);
2867			goto dealloc_counters;
2868		}
2869	}
2870
2871	return 0;
2872
2873dealloc_counters:
2874	while (--i >= 0)
2875		mlx5_ib_dealloc_q_port_counter(dev, i);
2876
2877	return ret;
2878}
2879
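/*
 * Q counter names exposed through the rdma_hw_stats interface.  The
 * stats_offsets[] table below holds, in the same order, the byte offset
 * of each counter within the query_q_counter_out mailbox.
 */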
2880static const char * const names[] = {
2881	"rx_write_requests",
2882	"rx_read_requests",
2883	"rx_atomic_requests",
2884	"out_of_buffer",
2885	"out_of_sequence",
2886	"duplicate_request",
2887	"rnr_nak_retry_err",
2888	"packet_seq_err",
2889	"implied_nak_seq_err",
2890	"local_ack_timeout_err",
2891};
2892
2893static const size_t stats_offsets[] = {
2894	MLX5_BYTE_OFF(query_q_counter_out, rx_write_requests),
2895	MLX5_BYTE_OFF(query_q_counter_out, rx_read_requests),
2896	MLX5_BYTE_OFF(query_q_counter_out, rx_atomic_requests),
2897	MLX5_BYTE_OFF(query_q_counter_out, out_of_buffer),
2898	MLX5_BYTE_OFF(query_q_counter_out, out_of_sequence),
2899	MLX5_BYTE_OFF(query_q_counter_out, duplicate_request),
2900	MLX5_BYTE_OFF(query_q_counter_out, rnr_nak_retry_err),
2901	MLX5_BYTE_OFF(query_q_counter_out, packet_seq_err),
2902	MLX5_BYTE_OFF(query_q_counter_out, implied_nak_seq_err),
2903	MLX5_BYTE_OFF(query_q_counter_out, local_ack_timeout_err),
2904};
2905
2906static struct rdma_hw_stats *mlx5_ib_alloc_hw_stats(struct ib_device *ibdev,
2907						    u8 port_num)
2908{
2909	BUILD_BUG_ON(ARRAY_SIZE(names) != ARRAY_SIZE(stats_offsets));
2910
2911	/* We support only per port stats */
2912	if (port_num == 0)
2913		return NULL;
2914
2915	return rdma_alloc_hw_stats_struct(names, ARRAY_SIZE(names),
2916					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
2917}
2918
2919static int mlx5_ib_get_hw_stats(struct ib_device *ibdev,
2920				struct rdma_hw_stats *stats,
2921				u8 port, int index)
2922{
2923	struct mlx5_ib_dev *dev = to_mdev(ibdev);
2924	int outlen = MLX5_ST_SZ_BYTES(query_q_counter_out);
2925	void *out;
2926	__be32 val;
2927	int ret;
2928	int i;
2929
2930	if (!port || !stats)
2931		return -ENOSYS;
2932
2933	out = mlx5_vzalloc(outlen);
2934	if (!out)
2935		return -ENOMEM;
2936
2937	ret = mlx5_vport_query_q_counter(dev->mdev,
2938					dev->port[port - 1].q_cnt_id, 0,
2939					out, outlen);
2940	if (ret)
2941		goto free;
2942
2943	for (i = 0; i < ARRAY_SIZE(names); i++) {
2944		val = *(__be32 *)(out + stats_offsets[i]);
2945		stats->value[i] = (u64)be32_to_cpu(val);
2946	}
2947free:
2948	kvfree(out);
2949	return ARRAY_SIZE(names);
2950}
2951
2952static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
2953{
2954	struct mlx5_ib_dev *dev;
2955	enum rdma_link_layer ll;
2956	int port_type_cap;
2957	int err;
2958	int i;
2959
2960	port_type_cap = MLX5_CAP_GEN(mdev, port_type);
2961	ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap);
2962
2963	if ((ll == IB_LINK_LAYER_ETHERNET) && !MLX5_CAP_GEN(mdev, roce))
2964		return NULL;
2965
2966	printk_once(KERN_INFO "%s", mlx5_version);
2967
2968	dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
2969	if (!dev)
2970		return NULL;
2971
2972	dev->mdev = mdev;
2973
2974	dev->port = kcalloc(MLX5_CAP_GEN(mdev, num_ports), sizeof(*dev->port),
2975			    GFP_KERNEL);
2976	if (!dev->port)
2977		goto err_dealloc;
2978
2979	rwlock_init(&dev->roce.netdev_lock);
2980	err = get_port_caps(dev);
2981	if (err)
2982		goto err_free_port;
2983
2984	if (mlx5_use_mad_ifc(dev))
2985		get_ext_port_caps(dev);
2986
2987	MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock);
2988
2989	snprintf(dev->ib_dev.name, IB_DEVICE_NAME_MAX, "mlx5_%d", device_get_unit(mdev->pdev->dev.bsddev));
2990	dev->ib_dev.owner		= THIS_MODULE;
2991	dev->ib_dev.node_type		= RDMA_NODE_IB_CA;
2992	dev->ib_dev.local_dma_lkey	= 0 /* not supported for now */;
2993	dev->num_ports		= MLX5_CAP_GEN(mdev, num_ports);
2994	dev->ib_dev.phys_port_cnt     = dev->num_ports;
2995	dev->ib_dev.num_comp_vectors    =
2996		dev->mdev->priv.eq_table.num_comp_vectors;
2997	dev->ib_dev.dma_device	= &mdev->pdev->dev;
2998
2999	dev->ib_dev.uverbs_abi_ver	= MLX5_IB_UVERBS_ABI_VERSION;
3000	dev->ib_dev.uverbs_cmd_mask	=
3001		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
3002		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
3003		(1ull << IB_USER_VERBS_CMD_QUERY_PORT)		|
3004		(1ull << IB_USER_VERBS_CMD_ALLOC_PD)		|
3005		(1ull << IB_USER_VERBS_CMD_DEALLOC_PD)		|
3006		(1ull << IB_USER_VERBS_CMD_CREATE_AH)		|
3007		(1ull << IB_USER_VERBS_CMD_DESTROY_AH)		|
3008		(1ull << IB_USER_VERBS_CMD_REG_MR)		|
3009		(1ull << IB_USER_VERBS_CMD_REREG_MR)		|
3010		(1ull << IB_USER_VERBS_CMD_DEREG_MR)		|
3011		(1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL)	|
3012		(1ull << IB_USER_VERBS_CMD_CREATE_CQ)		|
3013		(1ull << IB_USER_VERBS_CMD_RESIZE_CQ)		|
3014		(1ull << IB_USER_VERBS_CMD_DESTROY_CQ)		|
3015		(1ull << IB_USER_VERBS_CMD_CREATE_QP)		|
3016		(1ull << IB_USER_VERBS_CMD_MODIFY_QP)		|
3017		(1ull << IB_USER_VERBS_CMD_QUERY_QP)		|
3018		(1ull << IB_USER_VERBS_CMD_DESTROY_QP)		|
3019		(1ull << IB_USER_VERBS_CMD_ATTACH_MCAST)	|
3020		(1ull << IB_USER_VERBS_CMD_DETACH_MCAST)	|
3021		(1ull << IB_USER_VERBS_CMD_CREATE_SRQ)		|
3022		(1ull << IB_USER_VERBS_CMD_MODIFY_SRQ)		|
3023		(1ull << IB_USER_VERBS_CMD_QUERY_SRQ)		|
3024		(1ull << IB_USER_VERBS_CMD_DESTROY_SRQ)		|
3025		(1ull << IB_USER_VERBS_CMD_CREATE_XSRQ)		|
3026		(1ull << IB_USER_VERBS_CMD_OPEN_QP);
3027	dev->ib_dev.uverbs_ex_cmd_mask =
3028		(1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE)	|
3029		(1ull << IB_USER_VERBS_EX_CMD_CREATE_CQ)	|
3030		(1ull << IB_USER_VERBS_EX_CMD_CREATE_QP);
3031
3032	dev->ib_dev.query_device	= mlx5_ib_query_device;
3033	dev->ib_dev.query_port		= mlx5_ib_query_port;
3034	dev->ib_dev.get_link_layer	= mlx5_ib_port_link_layer;
3035	if (ll == IB_LINK_LAYER_ETHERNET)
3036		dev->ib_dev.get_netdev	= mlx5_ib_get_netdev;
3037	dev->ib_dev.query_gid		= mlx5_ib_query_gid;
3038	dev->ib_dev.add_gid		= mlx5_ib_add_gid;
3039	dev->ib_dev.del_gid		= mlx5_ib_del_gid;
3040	dev->ib_dev.query_pkey		= mlx5_ib_query_pkey;
3041	dev->ib_dev.modify_device	= mlx5_ib_modify_device;
3042	dev->ib_dev.modify_port		= mlx5_ib_modify_port;
3043	dev->ib_dev.alloc_ucontext	= mlx5_ib_alloc_ucontext;
3044	dev->ib_dev.dealloc_ucontext	= mlx5_ib_dealloc_ucontext;
3045	dev->ib_dev.mmap		= mlx5_ib_mmap;
3046	dev->ib_dev.alloc_pd		= mlx5_ib_alloc_pd;
3047	dev->ib_dev.dealloc_pd		= mlx5_ib_dealloc_pd;
3048	dev->ib_dev.create_ah		= mlx5_ib_create_ah;
3049	dev->ib_dev.query_ah		= mlx5_ib_query_ah;
3050	dev->ib_dev.destroy_ah		= mlx5_ib_destroy_ah;
3051	dev->ib_dev.create_srq		= mlx5_ib_create_srq;
3052	dev->ib_dev.modify_srq		= mlx5_ib_modify_srq;
3053	dev->ib_dev.query_srq		= mlx5_ib_query_srq;
3054	dev->ib_dev.destroy_srq		= mlx5_ib_destroy_srq;
3055	dev->ib_dev.post_srq_recv	= mlx5_ib_post_srq_recv;
3056	dev->ib_dev.create_qp		= mlx5_ib_create_qp;
3057	dev->ib_dev.modify_qp		= mlx5_ib_modify_qp;
3058	dev->ib_dev.query_qp		= mlx5_ib_query_qp;
3059	dev->ib_dev.destroy_qp		= mlx5_ib_destroy_qp;
3060	dev->ib_dev.post_send		= mlx5_ib_post_send;
3061	dev->ib_dev.post_recv		= mlx5_ib_post_recv;
3062	dev->ib_dev.create_cq		= mlx5_ib_create_cq;
3063	dev->ib_dev.modify_cq		= mlx5_ib_modify_cq;
3064	dev->ib_dev.resize_cq		= mlx5_ib_resize_cq;
3065	dev->ib_dev.destroy_cq		= mlx5_ib_destroy_cq;
3066	dev->ib_dev.poll_cq		= mlx5_ib_poll_cq;
3067	dev->ib_dev.req_notify_cq	= mlx5_ib_arm_cq;
3068	dev->ib_dev.get_dma_mr		= mlx5_ib_get_dma_mr;
3069	dev->ib_dev.reg_user_mr		= mlx5_ib_reg_user_mr;
3070	dev->ib_dev.rereg_user_mr	= mlx5_ib_rereg_user_mr;
3071	dev->ib_dev.reg_phys_mr		= mlx5_ib_reg_phys_mr;
3072	dev->ib_dev.dereg_mr		= mlx5_ib_dereg_mr;
3073	dev->ib_dev.attach_mcast	= mlx5_ib_mcg_attach;
3074	dev->ib_dev.detach_mcast	= mlx5_ib_mcg_detach;
3075	dev->ib_dev.process_mad		= mlx5_ib_process_mad;
3076	dev->ib_dev.alloc_mr		= mlx5_ib_alloc_mr;
3077	dev->ib_dev.map_mr_sg		= mlx5_ib_map_mr_sg;
3078	dev->ib_dev.check_mr_status	= mlx5_ib_check_mr_status;
3079	dev->ib_dev.get_port_immutable  = mlx5_port_immutable;
3080	dev->ib_dev.get_dev_fw_str      = get_dev_fw_str;
3081	if (mlx5_core_is_pf(mdev)) {
3082		dev->ib_dev.get_vf_config	= mlx5_ib_get_vf_config;
3083		dev->ib_dev.set_vf_link_state	= mlx5_ib_set_vf_link_state;
3084		dev->ib_dev.get_vf_stats	= mlx5_ib_get_vf_stats;
3085		dev->ib_dev.set_vf_guid		= mlx5_ib_set_vf_guid;
3086	}
3087
3088	mlx5_ib_internal_fill_odp_caps(dev);
3089
3090	if (MLX5_CAP_GEN(mdev, imaicl)) {
3091		dev->ib_dev.alloc_mw		= mlx5_ib_alloc_mw;
3092		dev->ib_dev.dealloc_mw		= mlx5_ib_dealloc_mw;
3093		dev->ib_dev.uverbs_cmd_mask |=
3094			(1ull << IB_USER_VERBS_CMD_ALLOC_MW)	|
3095			(1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
3096	}
3097
3098	if (MLX5_CAP_GEN(dev->mdev, out_of_seq_cnt) &&
3099	    MLX5_CAP_GEN(dev->mdev, retransmission_q_counters)) {
3100		dev->ib_dev.get_hw_stats	= mlx5_ib_get_hw_stats;
3101		dev->ib_dev.alloc_hw_stats	= mlx5_ib_alloc_hw_stats;
3102	}
3103
3104	if (MLX5_CAP_GEN(mdev, xrc)) {
3105		dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
3106		dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
3107		dev->ib_dev.uverbs_cmd_mask |=
3108			(1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
3109			(1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
3110	}
3111
3112	if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) ==
3113	    IB_LINK_LAYER_ETHERNET) {
3114		dev->ib_dev.create_flow	= mlx5_ib_create_flow;
3115		dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow;
3116		dev->ib_dev.create_wq	 = mlx5_ib_create_wq;
3117		dev->ib_dev.modify_wq	 = mlx5_ib_modify_wq;
3118		dev->ib_dev.destroy_wq	 = mlx5_ib_destroy_wq;
3119		dev->ib_dev.create_rwq_ind_table = mlx5_ib_create_rwq_ind_table;
3120		dev->ib_dev.destroy_rwq_ind_table = mlx5_ib_destroy_rwq_ind_table;
3121		dev->ib_dev.uverbs_ex_cmd_mask |=
3122			(1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
3123			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW) |
3124			(1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
3125			(1ull << IB_USER_VERBS_EX_CMD_MODIFY_WQ) |
3126			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_WQ) |
3127			(1ull << IB_USER_VERBS_EX_CMD_CREATE_RWQ_IND_TBL) |
3128			(1ull << IB_USER_VERBS_EX_CMD_DESTROY_RWQ_IND_TBL);
3129	}
3130	err = init_node_data(dev);
3131	if (err)
3132		goto err_free_port;
3133
3134	mutex_init(&dev->flow_db.lock);
3135	mutex_init(&dev->cap_mask_mutex);
3136	INIT_LIST_HEAD(&dev->qp_list);
3137	spin_lock_init(&dev->reset_flow_resource_lock);
3138
3139	if (ll == IB_LINK_LAYER_ETHERNET) {
3140		err = mlx5_enable_roce(dev);
3141		if (err)
3142			goto err_free_port;
3143	}
3144
3145	err = create_dev_resources(&dev->devr);
3146	if (err)
3147		goto err_disable_roce;
3148
3149	err = mlx5_ib_odp_init_one(dev);
3150	if (err)
3151		goto err_rsrc;
3152
3153	err = mlx5_ib_alloc_q_counters(dev);
3154	if (err)
3155		goto err_odp;
3156
3157	err = ib_register_device(&dev->ib_dev, NULL);
3158	if (err)
3159		goto err_q_cnt;
3160
3161	err = create_umr_res(dev);
3162	if (err)
3163		goto err_dev;
3164
3165	for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
3166		err = device_create_file(&dev->ib_dev.dev,
3167					 mlx5_class_attributes[i]);
3168		if (err)
3169			goto err_umrc;
3170	}
3171
3172	err = mlx5_ib_init_congestion(dev);
3173	if (err)
3174		goto err_umrc;
3175
3176	dev->ib_active = true;
3177
3178	return dev;
3179
3180err_umrc:
3181	destroy_umrc_res(dev);
3182
3183err_dev:
3184	ib_unregister_device(&dev->ib_dev);
3185
3186err_q_cnt:
3187	mlx5_ib_dealloc_q_counters(dev);
3188
3189err_odp:
3190	mlx5_ib_odp_remove_one(dev);
3191
3192err_rsrc:
3193	destroy_dev_resources(&dev->devr);
3194
3195err_disable_roce:
3196	if (ll == IB_LINK_LAYER_ETHERNET) {
3197		mlx5_disable_roce(dev);
3198		mlx5_remove_roce_notifier(dev);
3199	}
3200
3201err_free_port:
3202	kfree(dev->port);
3203
3204err_dealloc:
3205	ib_dealloc_device((struct ib_device *)dev);
3206
3207	return NULL;
3208}
3209
3210static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
3211{
3212	struct mlx5_ib_dev *dev = context;
3213	enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1);
3214
3215	mlx5_ib_cleanup_congestion(dev);
3216	mlx5_remove_roce_notifier(dev);
3217	ib_unregister_device(&dev->ib_dev);
3218	mlx5_ib_dealloc_q_counters(dev);
3219	destroy_umrc_res(dev);
3220	mlx5_ib_odp_remove_one(dev);
3221	destroy_dev_resources(&dev->devr);
3222	if (ll == IB_LINK_LAYER_ETHERNET)
3223		mlx5_disable_roce(dev);
3224	kfree(dev->port);
3225	ib_dealloc_device(&dev->ib_dev);
3226}
3227
3228static struct mlx5_interface mlx5_ib_interface = {
3229	.add            = mlx5_ib_add,
3230	.remove         = mlx5_ib_remove,
3231	.event          = mlx5_ib_event,
3232	.protocol	= MLX5_INTERFACE_PROTOCOL_IB,
3233};
3234
3235static int __init mlx5_ib_init(void)
3236{
3237	int err;
3238
3239	if (deprecated_prof_sel != 2)
3240		pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
3241
3242	err = mlx5_ib_odp_init();
3243	if (err)
3244		return err;
3245
3246	err = mlx5_register_interface(&mlx5_ib_interface);
3247	if (err)
3248		goto clean_odp;
3249
3250	return err;
3251
3252clean_odp:
3253	mlx5_ib_odp_cleanup();
3254	return err;
3255}
3256
3257static void __exit mlx5_ib_cleanup(void)
3258{
3259	mlx5_unregister_interface(&mlx5_ib_interface);
3260	mlx5_ib_odp_cleanup();
3261}
3262
3263module_init_order(mlx5_ib_init, SI_ORDER_THIRD);
3264module_exit_order(mlx5_ib_cleanup, SI_ORDER_THIRD);
3265