1/*
2
3 * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
4 * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
5 *
6 * This software is available to you under a choice of one of two
7 * licenses.  You may choose to be licensed under the terms of the GNU
8 * General Public License (GPL) Version 2, available from the file
9 * COPYING in the main directory of this source tree, or the
10 * OpenIB.org BSD license below:
11 *
12 *     Redistribution and use in source and binary forms, with or
13 *     without modification, are permitted provided that the following
14 *     conditions are met:
15 *
16 *      - Redistributions of source code must retain the above
17 *        copyright notice, this list of conditions and the following
18 *        disclaimer.
19 *
20 *      - Redistributions in binary form must reproduce the above
21 *        copyright notice, this list of conditions and the following
22 *        disclaimer in the documentation and/or other materials
23 *        provided with the distribution.
24 *
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32 * SOFTWARE.
33
34
35 #include <linux/log2.h>
36 #include <linux/slab.h>
37 #include <linux/netdevice.h>
38 #include <linux/bitops.h>
39
40 #include <rdma/ib_cache.h>
41 */
42#include <rdma/ib_pack.h>
43/*
44 #include <rdma/ib_addr.h>
45 #include <rdma/ib_mad.h>
46 */
47#include <linux/mlx4/qp.h>
48/*
49 #include <linux/mlx4/driver.h>
50 #include <linux/io.h>
51 */
52#include <linux/err.h>
53#include <linux/log2.h>
54#include <linux/gfp.h>
55#include <linux/compiler.h>
56
57#include <netinet/in.h>
58#include <asm/byteorder.h>
59
60#include <debug.h>
61/*
62 #ifndef __linux__
63 #define asm __asm
64 #endif
65 */
66#include "mlx4_ib.h"
67#include "user.h"
68
69enum {
70	MLX4_IB_ACK_REQ_FREQ = 8,
71};
72
73enum {
74	MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83,
75	MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f,
76	MLX4_IB_LINK_TYPE_IB = 0,
77	MLX4_IB_LINK_TYPE_ETH = 1
78};
79
enum {
	/*
	 * Largest possible UD header: send with GRH and immediate
	 * data plus 18 bytes for an Ethernet header with VLAN/802.1Q
	 * tag.  (LRH would only use 8 bytes, so Ethernet is the
	 * biggest case.)
	 */
	MLX4_IB_UD_HEADER_SIZE		= 82,
	MLX4_IB_LSO_HEADER_SPARE	= 128,
};
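/*
 * Illustrative breakdown of MLX4_IB_UD_HEADER_SIZE (assuming the usual
 * InfiniBand/Ethernet field sizes): Ethernet + VLAN tag (14 + 4 = 18 bytes)
 * + GRH (40) + BTH (12) + DETH (8) + immediate data (4) = 82 bytes.  With
 * an LRH instead of Ethernet the total would only be 8 + 40 + 12 + 8 + 4 = 72.
 */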
89
90enum {
91	MLX4_IB_IBOE_ETHERTYPE = 0x8915
92};
93
94struct mlx4_ib_sqp {
95	struct mlx4_ib_qp qp;
96	int pkey_index;
97	u32 qkey;
98	u32 send_psn;
99	struct ib_ud_header ud_header;
100	u8 header_buf[MLX4_IB_UD_HEADER_SIZE];
101};
102
enum {
	MLX4_IB_MIN_SQ_STRIDE	= 6,
	MLX4_IB_CACHE_LINE_SIZE	= 64,
};
106
enum {
	MLX4_RAW_QP_MTU		= 7,
	MLX4_RAW_QP_MSGMAX	= 31,
};
110
static const __be32 mlx4_ib_opcode[] = {
	[IB_WR_SEND]				= cpu_to_be32(MLX4_OPCODE_SEND),
	[IB_WR_LSO]				= cpu_to_be32(MLX4_OPCODE_LSO),
	[IB_WR_SEND_WITH_IMM]			= cpu_to_be32(MLX4_OPCODE_SEND_IMM),
	[IB_WR_RDMA_WRITE]			= cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
	[IB_WR_RDMA_WRITE_WITH_IMM]		= cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
	[IB_WR_RDMA_READ]			= cpu_to_be32(MLX4_OPCODE_RDMA_READ),
	[IB_WR_ATOMIC_CMP_AND_SWP]		= cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
	[IB_WR_ATOMIC_FETCH_AND_ADD]		= cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
	[IB_WR_SEND_WITH_INV]			= cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
	[IB_WR_LOCAL_INV]			= cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
	[IB_WR_FAST_REG_MR]			= cpu_to_be32(MLX4_OPCODE_FMR),
	[IB_WR_MASKED_ATOMIC_CMP_AND_SWP]	= cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS),
	[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]	= cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
};
126
127#ifndef wc_wmb
128#if defined(__i386__)
129#define wc_wmb() __asm volatile("lock; addl $0,0(%%esp) " ::: "memory")
130#elif defined(__x86_64__)
131#define wc_wmb() __asm volatile("sfence" ::: "memory")
132#elif defined(__ia64__)
133#define wc_wmb() __asm volatile("fwb" ::: "memory")
134#else
135#define wc_wmb() wmb()
136#endif
137#endif
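/*
 * Note (editorial, not from the original source): wc_wmb() is a
 * write-combining flush.  It is presumably used on the send path when a
 * WQE is copied to the BlueFlame register, to make sure the copy leaves
 * the CPU's write-combining buffers before the doorbell takes effect.
 * On architectures without a dedicated instruction it falls back to a
 * plain wmb().
 */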
138
139static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) {
140	return container_of(mqp, struct mlx4_ib_sqp, qp);
141}
142/*
143 static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) {
144 if (!mlx4_is_master(dev->dev))
145 return 0;
146
147 return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn
148 && qp->mqp.qpn
149 < dev->dev->phys_caps.base_tunnel_sqpn + 8 * MLX4_MFUNC_MAX;
150 }
151 */
152static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) {
153	int proxy_sqp = 0;
154	int real_sqp = 0;
155	int i;
156	/*PPF or Native -- real SQP*/
157	real_sqp = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev))
158			&& qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn
159			&& qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 3);
160	if (real_sqp)
161		return 1;
162	/*VF or PF -- proxy SQP*/
163	if (mlx4_is_mfunc(dev->dev)) {
164		for (i = 0; i < dev->dev->caps.num_ports; i++) {
165			if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i]
166					|| qp->mqp.qpn == dev->dev->caps.qp1_proxy[i]) {
167				proxy_sqp = 1;
168				break;
169			}
170		}
171	}
172	return proxy_sqp;
173}
174
175/*used for INIT/CLOSE port logic*/
176static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) {
177	int proxy_qp0 = 0;
178	int real_qp0 = 0;
179	int i;
180	/*PPF or Native -- real QP0*/
181	real_qp0 = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev))
182			&& qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn
183			&& qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 1);
184	if (real_qp0)
185		return 1;
186	/*VF or PF -- proxy QP0*/
187	if (mlx4_is_mfunc(dev->dev)) {
188		for (i = 0; i < dev->dev->caps.num_ports; i++) {
189			if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i]) {
190				proxy_qp0 = 1;
191				break;
192			}
193		}
194	}
195	return proxy_qp0;
196}
197
198static void *get_wqe(struct mlx4_ib_qp *qp, int offset) {
199	return mlx4_buf_offset(&qp->buf, offset);
200}
201
202static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n) {
203	return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
204}
205
206static void *get_send_wqe(struct mlx4_ib_qp *qp, int n) {
207	return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift));
208}
/*
 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 * first four bytes of every 64 byte chunk with
 *     0x7FFFFFFF | (invalid_ownership_value << 31).
 *
 * When the max work request size is less than or equal to the WQE
 * basic block size, as an optimization, we can stamp all WQEs with
 * 0xffffffff, and skip the very first chunk of each WQE.
 */
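/*
 * Illustrative example: in the shrinking-WQE case (wqe_shift = 6, i.e.
 * 64-byte basic blocks) a 192-byte work request covers the chunks at
 * offsets 0, 64 and 128, and the loop below writes the stamp into the
 * first four bytes of each chunk.  The stamp alternates between
 * 0xffffffff and 0x7fffffff from one pass over the ring to the next, so
 * that a prefetched descriptor always carries the wrong ownership value.
 */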
219static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size) {
220	__be32 *wqe;
221	int i;
222	int s;
223	int ind;
224	void *buf;
225	__be32 stamp;
226	struct mlx4_wqe_ctrl_seg *ctrl;
227
228	if (qp->sq_max_wqes_per_wr > 1) {
229		s = roundup(size, 1U << qp->sq.wqe_shift);
230		for (i = 0; i < s; i += 64) {
231			ind = (i >> qp->sq.wqe_shift) + n;
232			stamp = ind & qp->sq.wqe_cnt ?
233					cpu_to_be32(0x7fffffff) : cpu_to_be32(0xffffffff);
234			buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
235			wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
236			*wqe = stamp;
237		}
238	} else {
239		ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
240		s = (ctrl->fence_size & 0x3f) << 4;
241		for (i = 64; i < s; i += 64) {
242			wqe = buf + i;
243			*wqe = cpu_to_be32(0xffffffff);
244		}
245	}
246}
247
248static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size) {
249	struct mlx4_wqe_ctrl_seg *ctrl;
250	struct mlx4_wqe_inline_seg *inl;
251	void *wqe;
252	int s;
253
254	ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
255	s = sizeof(struct mlx4_wqe_ctrl_seg);
256
257	if (qp->ibqp.qp_type == IB_QPT_UD) {
258		struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
259		struct mlx4_av *av = (struct mlx4_av *) dgram->av;
260		memset(dgram, 0, sizeof *dgram);
261		av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
262		s += sizeof(struct mlx4_wqe_datagram_seg);
263	}
264
265	/*Pad the remainder of the WQE with an inline data segment.*/
266	if (size > s) {
267		inl = wqe + s;
268		inl->byte_count = cpu_to_be32(1U << 31 | (size - s - sizeof *inl));
269	}
270	ctrl->srcrb_flags = 0;
271	ctrl->fence_size = size / 16;
272
273	/** Make sure descriptor is fully written before setting ownership bit
274	 * (because HW can start executing as soon as we do).*/
275
276	wmb();
277
278	ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC)
279			| (n & qp->sq.wqe_cnt ? cpu_to_be32(1U << 31) : 0);
280
281	stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
282}
/*
 * Post a NOP WQE to prevent wrap-around in the middle of a WR
 */
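/*
 * Illustrative example: with sq.wqe_cnt = 64, sq_max_wqes_per_wr = 4 and
 * ind = 62, only s = 2 basic blocks remain before the end of the ring, so
 * a 2-block NOP WQE is posted and ind advances to 64, i.e. the next real
 * WR starts at the beginning of the queue again.
 */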
286static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind) {
287	unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
288	if (unlikely(s < qp->sq_max_wqes_per_wr)) {
289		post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
290		ind += s;
291	}
292	return ind;
293}
294
295static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type) {
296	struct ib_event event;
297	struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
298
299	if (type == MLX4_EVENT_TYPE_PATH_MIG)
300		to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
301
302	if (ibqp->event_handler) {
303		event.device = ibqp->device;
304		event.element.qp = ibqp;
305		switch (type) {
306		case MLX4_EVENT_TYPE_PATH_MIG:
307			event.event = IB_EVENT_PATH_MIG;
308			break;
309		case MLX4_EVENT_TYPE_COMM_EST:
310			event.event = IB_EVENT_COMM_EST;
311			break;
312		case MLX4_EVENT_TYPE_SQ_DRAINED:
313			event.event = IB_EVENT_SQ_DRAINED;
314			break;
315		case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
316			event.event = IB_EVENT_QP_LAST_WQE_REACHED;
317			break;
318		case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
319			event.event = IB_EVENT_QP_FATAL;
320			break;
321		case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
322			event.event = IB_EVENT_PATH_MIG_ERR;
323			break;
324		case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
325			event.event = IB_EVENT_QP_REQ_ERR;
326			break;
327		case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
328			event.event = IB_EVENT_QP_ACCESS_ERR;
329			break;
330		default:
331			MLX4_WARN("Unexpected event type %d "
332					"on QP %06x\n", type, qp->qpn);
333			return;
334		}
335
336		ibqp->event_handler(&event, ibqp->qp_context);
337	}
338}
339
340static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags) {
341
342	/** UD WQEs must have a datagram segment.
343	 * RC and UC WQEs might have a remote address segment.
344	 * MLX WQEs need two extra inline data segments (for the UD
345	 * header and space for the ICRC).*/
346
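	/*
	 * Rough example (segment sizes assumed here, not quoted from the
	 * headers): a control segment is 16 bytes and a datagram segment
	 * 48 bytes, so a plain UD WQE starts with 64 bytes of overhead
	 * before any gather entries, and an LSO-enabled UD QP reserves a
	 * further MLX4_IB_LSO_HEADER_SPARE bytes for the inline LSO header.
	 */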
347	switch (type) {
348	case MLX4_IB_QPT_UD:
349		return sizeof(struct mlx4_wqe_ctrl_seg)
350				+ sizeof(struct mlx4_wqe_datagram_seg)
351				+ ((flags & MLX4_IB_QP_LSO) ? MLX4_IB_LSO_HEADER_SPARE : 0);
352	case MLX4_IB_QPT_PROXY_SMI_OWNER:
353	case MLX4_IB_QPT_PROXY_SMI:
354	case MLX4_IB_QPT_PROXY_GSI:
355		return sizeof(struct mlx4_wqe_ctrl_seg)
356				+ sizeof(struct mlx4_wqe_datagram_seg) + 64;
357	case MLX4_IB_QPT_TUN_SMI_OWNER:
358	case MLX4_IB_QPT_TUN_GSI:
359		return sizeof(struct mlx4_wqe_ctrl_seg)
360				+ sizeof(struct mlx4_wqe_datagram_seg);
361
362	case MLX4_IB_QPT_UC:
363		return sizeof(struct mlx4_wqe_ctrl_seg)
364				+ sizeof(struct mlx4_wqe_raddr_seg);
365	case MLX4_IB_QPT_RC:
366		return sizeof(struct mlx4_wqe_ctrl_seg)
367				+ sizeof(struct mlx4_wqe_masked_atomic_seg)
368				+ sizeof(struct mlx4_wqe_raddr_seg);
369	case MLX4_IB_QPT_SMI:
370	case MLX4_IB_QPT_GSI:
371		return sizeof(struct mlx4_wqe_ctrl_seg)
372				+ ALIGN(
373						MLX4_IB_UD_HEADER_SIZE + DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE, MLX4_INLINE_ALIGN) * sizeof(struct mlx4_wqe_inline_seg),
374						sizeof(struct mlx4_wqe_data_seg))
375				+ ALIGN(4 + sizeof(struct mlx4_wqe_inline_seg),
376						sizeof(struct mlx4_wqe_data_seg));
377	default:
378		return sizeof(struct mlx4_wqe_ctrl_seg);
379	}
380}
381
382static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
383		int is_user, int has_rq, struct mlx4_ib_qp *qp) {
384	/*Sanity check RQ size before proceeding*/
385	if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE
386			|| cap->max_recv_sge
387					> min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg))
388		return -EINVAL;
389
390	if (!has_rq) {
391		if (cap->max_recv_wr)
392			return -EINVAL;
393
394		qp->rq.wqe_cnt = qp->rq.max_gs = 0;
395	} else {
396		/*HW requires >= 1 RQ entry with >= 1 gather entry*/
397		if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge))
398			return -EINVAL;
399
400		qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr));
401		qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge));
402		qp->rq.wqe_shift = ilog2(
403				qp->rq.max_gs * sizeof(struct mlx4_wqe_data_seg));
404	}
405
406	/*leave userspace return values as they were, so as not to break ABI*/
407	if (is_user) {
408		cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt;
409		cap->max_recv_sge = qp->rq.max_gs;
410	} else {
411		cap->max_recv_wr = qp->rq.max_post = min(
412				dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt);
413		cap->max_recv_sge = min(qp->rq.max_gs,
414				min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg));
415	}
416
417	return 0;
418}
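/*
 * Illustrative set_rq_size() example: asking for max_recv_wr = 100 and
 * max_recv_sge = 3 rounds up to wqe_cnt = 128 and max_gs = 4, and (with a
 * 16-byte scatter entry, as assumed here) wqe_shift = ilog2(4 * 16) = 6,
 * i.e. 64-byte receive WQEs.
 */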
419
420static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
421		enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp) {
422	int s;
423
424	/*Sanity check SQ size before proceeding*/
425	if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE)
426			|| cap->max_send_sge
427					> min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg)
428			|| cap->max_inline_data + send_wqe_overhead(type, qp->flags)
429					+ sizeof(struct mlx4_wqe_inline_seg)
430					> dev->dev->caps.max_sq_desc_sz)
431		return -EINVAL;
432
433	/** For MLX transport we need 2 extra S/G entries:
434	 * one for the header and one for the checksum at the end*/
435
436	if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI
437			|| type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))
438			&& cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
439		return -EINVAL;
440
441	s = max(cap->max_send_sge * sizeof(struct mlx4_wqe_data_seg),
442			cap->max_inline_data + sizeof(struct mlx4_wqe_inline_seg))
443			+ send_wqe_overhead(type, qp->flags);
444
445	if (s > dev->dev->caps.max_sq_desc_sz)
446		return -EINVAL;
447
448	/** Hermon supports shrinking WQEs, such that a single work
449	 * request can include multiple units of 1 << wqe_shift.  This
450	 * way, work requests can differ in size, and do not have to
451	 * be a power of 2 in size, saving memory and speeding up send
452	 * WR posting.  Unfortunately, if we do this then the
453	 * wqe_index field in CQEs can't be used to look up the WR ID
454	 * anymore, so we do this only if selective signaling is off.
455	 *
456	 * Further, on 32-bit platforms, we can't use vmap() to make
457	 * the QP buffer virtually contiguous.  Thus we have to use
458	 * constant-sized WRs to make sure a WR is always fully within
459	 * a single page-sized chunk.
460	 *
461	 * Finally, we use NOP work requests to pad the end of the
462	 * work queue, to avoid wrap-around in the middle of WR.  We
463	 * set NEC bit to avoid getting completions with error for
464	 * these NOP WRs, but since NEC is only supported starting
465	 * with firmware 2.2.232, we use constant-sized WRs for older
466	 * firmware.
467	 *
468	 * And, since MLX QPs only support SEND, we use constant-sized
469	 * WRs in this case.
470	 *
471	 * We look for the smallest value of wqe_shift such that the
472	 * resulting number of wqes does not exceed device
473	 * capabilities.
474	 *
475	 * We set WQE size to at least 64 bytes, this way stamping
476	 * invalidates each WQE.*/
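	/*
	 * Illustrative example of the search below: if a WR needs s = 200
	 * bytes and shrinking is enabled, wqe_shift starts at 6, so
	 * sq_max_wqes_per_wr = DIV_ROUND_UP(200, 64) = 4 and
	 * sq_spare_wqes = (2048 >> 6) + 4 = 36 basic blocks of headroom;
	 * with max_send_wr = 100 this gives
	 * wqe_cnt = roundup_pow_of_two(100 * 4 + 36) = 512.  If the
	 * resulting wqe_cnt exceeds the device limit, wqe_shift is bumped
	 * and the computation is repeated with larger basic blocks.
	 */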
477
478	if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC && qp->sq_signal_bits
479			&& BITS_PER_LONG == 64 && type != MLX4_IB_QPT_SMI
480			&& type != MLX4_IB_QPT_GSI
481			&& !(type
482					& (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI
483							| MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER)))
484		qp->sq.wqe_shift = ilog2(64);
485	else
486		qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
487
488	for (;;) {
489		qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);
490
491		/** We need to leave 2 KB + 1 WR of headroom in the SQ to
492		 * allow HW to prefetch.*/
493
494		qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
495		qp->sq.wqe_cnt = roundup_pow_of_two(
496				cap->max_send_wr * qp->sq_max_wqes_per_wr + qp->sq_spare_wqes);
497
498		if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
499			break;
500
501		if (qp->sq_max_wqes_per_wr <= 1)
502			return -EINVAL;
503
504		++qp->sq.wqe_shift;
505	}
506
507	qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz,
508			(qp->sq_max_wqes_per_wr << qp->sq.wqe_shift))
509			- send_wqe_overhead(type, qp->flags))
510			/ sizeof(struct mlx4_wqe_data_seg);
511
512	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift)
513			+ (qp->sq.wqe_cnt << qp->sq.wqe_shift);
514	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
515		qp->rq.offset = 0;
516		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
517	} else {
518		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
519		qp->sq.offset = 0;
520	}
521
522	cap->max_send_wr = qp->sq.max_post = (qp->sq.wqe_cnt - qp->sq_spare_wqes)
523			/ qp->sq_max_wqes_per_wr;
524	cap->max_send_sge = min(qp->sq.max_gs,
525			min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg));
526	qp->max_inline_data = cap->max_inline_data;
527
528	return 0;
529}
530/*
531 static int set_user_sq_size(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
532 struct mlx4_ib_create_qp *ucmd) {
533 Sanity check SQ size before proceeding
534 if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes
535 || ucmd->log_sq_stride
536 > ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz))
537 || ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE)
538 return -EINVAL;
539
540 qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count;
541 qp->sq.wqe_shift = ucmd->log_sq_stride;
542
543 qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift)
544 + (qp->sq.wqe_cnt << qp->sq.wqe_shift);
545
546 return 0;
547 }
548 */
549static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) {
550	int i;
551
552	qp->sqp_proxy_rcv = malloc(sizeof(struct mlx4_ib_buf) * qp->rq.wqe_cnt);
553	if (!qp->sqp_proxy_rcv)
554		return -ENOMEM;
555	for (i = 0; i < qp->rq.wqe_cnt; i++) {
556		qp->sqp_proxy_rcv[i].addr = dma_alloc(
557				sizeof(struct mlx4_ib_proxy_sqp_hdr),
558				&qp->sqp_proxy_rcv[i].map);
559
560		/*qp->sqp_proxy_rcv[i].addr = malloc(
561		 sizeof(struct mlx4_ib_proxy_sqp_hdr));
562		 if (!qp->sqp_proxy_rcv[i].addr)
563		 goto err;
564		 qp->sqp_proxy_rcv[i].map = ib_dma_map_single(dev,
565		 qp->sqp_proxy_rcv[i].addr, sizeof(struct mlx4_ib_proxy_sqp_hdr),
566		 DMA_FROM_DEVICE);*/
567	}
568	return 0;
569
570	/*TODO: cleanup*/
571	/*err:*//*while (i > 0) {
572	 --i;
573	 ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map,
574	 sizeof(struct mlx4_ib_proxy_sqp_hdr), DMA_FROM_DEVICE);
575	 free(qp->sqp_proxy_rcv[i].addr);
576	 }*/
577	/*free(qp->sqp_proxy_rcv);
578	 qp->sqp_proxy_rcv = NULL;
579	 return -ENOMEM;*/
580}
581/*
582 static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp) {
583 int i;
584
585 for (i = 0; i < qp->rq.wqe_cnt; i++) {
586 ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map,
587 sizeof(struct mlx4_ib_proxy_sqp_hdr), DMA_FROM_DEVICE);
588 free(qp->sqp_proxy_rcv[i].addr);
589 }
590 free(qp->sqp_proxy_rcv);
591 }
592 */
593static int qp_has_rq(struct ib_qp_init_attr *attr) {
594	if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT)
595		return 0;
596
597	return !attr->srq;
598}
599/*
600 #ifdef __linux__
601 static int init_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp,
602 struct ib_qp_init_attr *attr, int *qpn) {
603 struct mlx4_ib_qpg_data *qpg_data;
604 int tss_num, rss_num;
605 int tss_align_num, rss_align_num;
606 int tss_base, rss_base = 0;
607 int err;
608
609 Parent is part of the TSS range (in SW TSS ARP is sent via parent)
610 tss_num = 1 + attr->parent_attrib.tss_child_count;
611 tss_align_num = roundup_pow_of_two(tss_num);
612 rss_num = attr->parent_attrib.rss_child_count;
613 rss_align_num = roundup_pow_of_two(rss_num);
614
615 if (rss_num > 1) {
616 RSS is requested
617 if (!(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS))
618 return -ENOSYS;
619 if (rss_align_num > dev->dev->caps.max_rss_tbl_sz)
620 return -EINVAL;
621 We must work with power of two
622 attr->parent_attrib.rss_child_count = rss_align_num;
623 }
624
625 qpg_data = calloc(1,sizeof *qpg_data);
626 if (!qpg_data)
627 return -ENOMEM;
628
629 if (pqp->flags & MLX4_IB_QP_NETIF)
630 err = mlx4_ib_steer_qp_alloc(dev, tss_align_num, &tss_base);
631 else
632 err = mlx4_qp_reserve_range(dev->dev, tss_align_num, tss_align_num, &tss_base,
633 1);
634 if (err)
635 goto err1;
636
637 if (tss_num > 1) {
638 u32 alloc = BITS_TO_LONGS(tss_align_num) * sizeof(long);
639 qpg_data->tss_bitmap = calloc(1,alloc);
640 if (qpg_data->tss_bitmap == NULL) {
641 err = -ENOMEM;
642 goto err2;
643 }
644 bitmap_fill(qpg_data->tss_bitmap, tss_num);
645 Note parent takes first index
646 clear_bit(0, qpg_data->tss_bitmap);
647 }
648
649 if (rss_num > 1) {
650 u32 alloc = BITS_TO_LONGS(rss_align_num) * sizeof(long);
651 err = mlx4_qp_reserve_range(dev->dev, rss_align_num, 1, &rss_base, 0);
652 if (err)
653 goto err3;
654 qpg_data->rss_bitmap = calloc(1,alloc);
655 if (qpg_data->rss_bitmap == NULL) {
656 err = -ENOMEM;
657 goto err4;
658 }
659 bitmap_fill(qpg_data->rss_bitmap, rss_align_num);
660 }
661
662 qpg_data->tss_child_count = attr->parent_attrib.tss_child_count;
663 qpg_data->rss_child_count = attr->parent_attrib.rss_child_count;
664 qpg_data->qpg_parent = pqp;
665 qpg_data->qpg_tss_mask_sz = ilog2(tss_align_num);
666 qpg_data->tss_qpn_base = tss_base;
667 qpg_data->rss_qpn_base = rss_base;
668
669 pqp->qpg_data = qpg_data;
670 *qpn = tss_base;
671
672 return 0;
673
674 err4: mlx4_qp_release_range(dev->dev, rss_base, rss_align_num);
675
676 err3:
677 if (tss_num > 1)
678 free(qpg_data->tss_bitmap);
679
680 err2: if (pqp->flags & MLX4_IB_QP_NETIF)
681 mlx4_ib_steer_qp_free(dev, tss_base, tss_align_num);
682 else
683 mlx4_qp_release_range(dev->dev, tss_base, tss_align_num);
684
685 err1:
686 free(qpg_data);
687 return err;
688 }
689
690 static void free_qpg_parent(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *pqp) {
691 struct mlx4_ib_qpg_data *qpg_data = pqp->qpg_data;
692 int align_num;
693
694 if (qpg_data->tss_child_count > 1)
695 free(qpg_data->tss_bitmap);
696
697 align_num = roundup_pow_of_two(1 + qpg_data->tss_child_count);
698 if (pqp->flags & MLX4_IB_QP_NETIF)
699 mlx4_ib_steer_qp_free(dev, qpg_data->tss_qpn_base, align_num);
700 else
701 mlx4_qp_release_range(dev->dev, qpg_data->tss_qpn_base, align_num);
702
703 if (qpg_data->rss_child_count > 1) {
704 free(qpg_data->rss_bitmap);
705 align_num = roundup_pow_of_two(qpg_data->rss_child_count);
706 mlx4_qp_release_range(dev->dev, qpg_data->rss_qpn_base, align_num);
707 }
708
709 free(qpg_data);
710 }
711
712 static int alloc_qpg_qpn(struct ib_qp_init_attr *init_attr,
713 struct mlx4_ib_qp *pqp, int *qpn) {
714 struct mlx4_ib_qp *mqp = to_mqp(init_attr->qpg_parent);
715 struct mlx4_ib_qpg_data *qpg_data = mqp->qpg_data;
716 u32 idx, old;
717
718 switch (init_attr->qpg_type) {
719 case IB_QPG_CHILD_TX:
720 if (qpg_data->tss_child_count == 0)
721 return -EINVAL;
722 do {
723 Parent took index 0
724 idx = find_first_bit(qpg_data->tss_bitmap, qpg_data->tss_child_count + 1);
725 if (idx >= qpg_data->tss_child_count + 1)
726 return -ENOMEM;
727 old = test_and_clear_bit(idx, qpg_data->tss_bitmap);
728 } while (old == 0);
729 idx += qpg_data->tss_qpn_base;
730 break;
731 case IB_QPG_CHILD_RX:
732 if (qpg_data->rss_child_count == 0)
733 return -EINVAL;
734 do {
735 idx = find_first_bit(qpg_data->rss_bitmap, qpg_data->rss_child_count);
736 if (idx >= qpg_data->rss_child_count)
737 return -ENOMEM;
738 old = test_and_clear_bit(idx, qpg_data->rss_bitmap);
739 } while (old == 0);
740 idx += qpg_data->rss_qpn_base;
741 break;
742 default:
743 return -EINVAL;
744 }
745
746 pqp->qpg_data = qpg_data;
747 *qpn = idx;
748
749 return 0;
750 }
751
752 static void free_qpg_qpn(struct mlx4_ib_qp *mqp, int qpn) {
753 struct mlx4_ib_qpg_data *qpg_data = mqp->qpg_data;
754
755 switch (mqp->qpg_type) {
756 case IB_QPG_CHILD_TX:
757 Do range check
758 qpn -= qpg_data->tss_qpn_base;
759 set_bit(qpn, qpg_data->tss_bitmap);
760 break;
761 case IB_QPG_CHILD_RX:
762 qpn -= qpg_data->rss_qpn_base;
763 set_bit(qpn, qpg_data->rss_bitmap);
764 break;
765 default:
766 error
767 MLX4_WARN("wrong qpg type (%d)\n", mqp->qpg_type);
768 break;
769 }
770 }
771 #endif
772 */
773static int alloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
774		struct ib_qp_init_attr *attr, int *qpn) {
775	int err = 0;
776
777	switch (attr->qpg_type) {
778	case IB_QPG_NONE:
779		/*Raw packet QPNs must be aligned to 8 bits. If not, the WQE
780		 * BlueFlame setup flow wrongly causes VLAN insertion.*/
781		if (attr->qp_type == IB_QPT_RAW_PACKET) {
782			err = mlx4_qp_reserve_range(dev->dev, 1, 1, qpn, 1);
783		} else {
784			if (qp->flags & MLX4_IB_QP_NETIF)
785				err = mlx4_ib_steer_qp_alloc(dev, 1, qpn);
786			else
787				err = mlx4_qp_reserve_range(dev->dev, 1, 1, qpn, 0);
788		}
789		break;
790	case IB_QPG_PARENT:
791#ifdef __linux__
792		err = init_qpg_parent(dev, qp, attr, qpn);
793#endif
794		break;
795	case IB_QPG_CHILD_TX:
796	case IB_QPG_CHILD_RX:
797#ifdef __linux__
798		err = alloc_qpg_qpn(attr, qp, qpn);
799#endif
800		break;
801	default:
802		qp->qpg_type = IB_QPG_NONE;
803		err = -EINVAL;
804		break;
805	}
806	if (err)
807		return err;
808	qp->qpg_type = attr->qpg_type;
809	return 0;
810}
811/*
812 static void free_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
813 enum ib_qpg_type qpg_type, int qpn) {
814 switch (qpg_type) {
815 case IB_QPG_NONE:
816 if (qp->flags & MLX4_IB_QP_NETIF)
817 mlx4_ib_steer_qp_free(dev, qpn, 1);
818 else
819 mlx4_qp_release_range(dev->dev, qpn, 1);
820 break;
821 case IB_QPG_PARENT:
822 #ifdef __linux__
823 free_qpg_parent(dev, qp);
824 #endif
825 break;
826 case IB_QPG_CHILD_TX:
827 case IB_QPG_CHILD_RX:
828 #ifdef __linux__
829 free_qpg_qpn(qp, qpn);
830 #endif
831 break;
832 default:
833 break;
834 }
835 }
836
837 Revert allocation on create_qp_common
838 static void unalloc_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
839 struct ib_qp_init_attr *attr, int qpn) {
840 free_qpn_common(dev, qp, attr->qpg_type, qpn);
841 }
842
843 static void release_qpn_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) {
844 free_qpn_common(dev, qp, qp->qpg_type, qp->mqp.qpn);
845 }
846 */
847static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
848		struct ib_qp_init_attr *init_attr, struct ib_udata *udata, int sqpn,
849		struct mlx4_ib_qp **caller_qp) {
850	int qpn;
851	int err;
852	struct mlx4_ib_sqp *sqp;
853	struct mlx4_ib_qp *qp;
854	enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
855
856#ifndef __linux__
857	init_attr->qpg_type = IB_QPG_NONE;
858#endif
859
860	/*When tunneling special qps, we use a plain UD qp*/
861	if (sqpn) {
862		if (mlx4_is_mfunc(dev->dev)
863				&& (!mlx4_is_master(dev->dev)
864						|| !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) {
865			if (init_attr->qp_type == IB_QPT_GSI)
866				qp_type = MLX4_IB_QPT_PROXY_GSI;
867			else if (mlx4_is_master(dev->dev))
868				qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER;
869			else
870				qp_type = MLX4_IB_QPT_PROXY_SMI;
871		}
872		qpn = sqpn;
873		/*add extra sg entry for tunneling*/
874		init_attr->cap.max_recv_sge++;
875	} else if (init_attr->create_flags & MLX4_IB_SRIOV_TUNNEL_QP) {
876		struct mlx4_ib_qp_tunnel_init_attr *tnl_init = container_of(init_attr,
877				struct mlx4_ib_qp_tunnel_init_attr, init_attr);
878		if ((tnl_init->proxy_qp_type != IB_QPT_SMI
879				&& tnl_init->proxy_qp_type != IB_QPT_GSI)
880				|| !mlx4_is_master(dev->dev))
881			return -EINVAL;
882		if (tnl_init->proxy_qp_type == IB_QPT_GSI)
883			qp_type = MLX4_IB_QPT_TUN_GSI;
884		else if (tnl_init->slave == mlx4_master_func_num(dev->dev))
885			qp_type = MLX4_IB_QPT_TUN_SMI_OWNER;
886		else
887			qp_type = MLX4_IB_QPT_TUN_SMI;
888		/*we are definitely in the PPF here, since we are creating
889		 * tunnel QPs. base_tunnel_sqpn is therefore valid.*/
890		qpn = dev->dev->phys_caps.base_tunnel_sqpn + 8 * tnl_init->slave
891				+ tnl_init->proxy_qp_type * 2 + tnl_init->port - 1;
892		sqpn = qpn;
893	}
894
895	if (!*caller_qp) {
896		if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI
897				|| (qp_type
898						& (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER
899								| MLX4_IB_QPT_PROXY_GSI
900								| MLX4_IB_QPT_TUN_SMI_OWNER))) {
901			sqp = calloc(1, sizeof(struct mlx4_ib_sqp));
902			if (!sqp)
903				return -ENOMEM;
904			qp = &sqp->qp;
905			qp->pri.vid = qp->alt.vid = 0xFFFF;
906		} else {
907			qp = calloc(1, sizeof(struct mlx4_ib_qp));
908			if (!qp)
909				return -ENOMEM;
910			qp->pri.vid = qp->alt.vid = 0xFFFF;
911		}
912	} else
913		qp = *caller_qp;
914
915	qp->mlx4_ib_qp_type = qp_type;
916
917	/*mutex_init(&qp->mutex);
918	 spin_lock_init(&qp->sq.lock);
919	 spin_lock_init(&qp->rq.lock);*/
920	INIT_LIST_HEAD(&qp->gid_list);
921	INIT_LIST_HEAD(&qp->steering_rules);
922	INIT_LIST_HEAD(&qp->rules_list);
923
924	qp->state = IB_QPS_RESET;
925	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
926		qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
927
928	err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr),
929			qp);
930	if (err)
931		goto err;
932
933	if (pd->uobject) {
934		assert(!"NYI");
935		/*struct mlx4_ib_create_qp ucmd;
936		 int shift;
937		 int n;
938
939		 if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
940		 err = -EFAULT;
941		 goto err;
942		 }
943
944		 qp->sq_no_prefetch = ucmd.sq_no_prefetch;
945
946		 err = set_user_sq_size(dev, qp, &ucmd);
947		 if (err)
948		 goto err;
949
950		 qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
951		 qp->buf_size, 0, 0);
952		 if (IS_ERR(qp->umem)) {
953		 err = PTR_ERR(qp->umem);
954		 goto err;
955		 }
956
957		 n = ib_umem_page_count(qp->umem);
958		 shift = mlx4_ib_umem_calc_optimal_mtt_size(qp->umem, 0, &n);
959		 err = mlx4_mtt_init(dev->dev, n, shift, &qp->mtt);
960
961		 if (err)
962		 goto err_buf;
963
964		 err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
965		 if (err)
966		 goto err_mtt;
967
968		 if (qp_has_rq(init_attr)) {
969		 err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
970		 ucmd.db_addr, &qp->db);
971		 if (err)
972		 goto err_mtt;
973		 }*/
974	} else {
975		qp->sq_no_prefetch = 0;
976
977		if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
978			qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
979
980		if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
981			qp->flags |= MLX4_IB_QP_LSO;
982
983		if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP
984				&& dev->dev->caps.steering_mode
985						== MLX4_STEERING_MODE_DEVICE_MANAGED
986				&& !mlx4_is_mfunc(dev->dev))
987			qp->flags |= MLX4_IB_QP_NETIF;
988
989		err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
990		if (err)
991			goto err;
992
993		if (qp_has_rq(init_attr)) {
994			err = mlx4_db_alloc(dev->dev, &qp->db, 0);
995			if (err)
996				goto err;
997
998			*qp->db.db = 0;
999		}
1000
1001		if (qp->max_inline_data) {
1002			err = mlx4_bf_alloc(dev->dev, &qp->bf, 0);
1003			if (err) {
1004				MLX4_DEBUG("failed to allocate blue flame"
1005						" register (%d)", err);
1006				qp->bf.uar = &dev->priv_uar;
1007			}
1008		} else
1009			qp->bf.uar = &dev->priv_uar;
1010
1011		if (mlx4_buf_alloc(dev->dev, qp->buf_size, BASE_PAGE_SIZE * 2,
1012				&qp->buf)) {
1013			err = -ENOMEM;
1014			goto err_db;
1015		}
1016
1017		err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
1018				&qp->mtt);
1019		if (err)
1020			goto err_buf;
1021
1022		err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf);
1023		if (err)
1024			goto err_mtt;
1025
1026		qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(u64));
1027		qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof(u64));
1028
1029		if (!qp->sq.wrid || !qp->rq.wrid) {
1030			err = -ENOMEM;
1031			goto err_wrid;
1032		}
1033	}
1034
1035	if (sqpn) {
1036		if (qp->mlx4_ib_qp_type
1037				& (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI
1038						| MLX4_IB_QPT_PROXY_GSI)) {
1039			if (alloc_proxy_bufs(pd->device, qp)) {
1040				err = -ENOMEM;
1041				goto err_wrid;
1042			}
1043		}
1044	} else {
1045		err = alloc_qpn_common(dev, qp, init_attr, &qpn);
1046		if (err)
1047			goto err_proxy;
1048	}
1049
1050	err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp);
1051	if (err)
1052		goto err_qpn;
1053
1054	if (init_attr->qp_type == IB_QPT_XRC_TGT)
1055		qp->mqp.qpn |= (1 << 23);
1056
1057	/** Hardware wants QPN written in big-endian order (after
1058	 * shifting) for send doorbell.  Precompute this value to save
1059	 * a little bit when posting sends.*/
1060
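	/*
	 * Example (little-endian host assumed): for QPN 0x43, qpn << 8 is
	 * 0x4300 and swab32() turns it into 0x00430000, so storing the
	 * value as-is writes the bytes 00 00 43 00 -- the big-endian form
	 * of the shifted QPN that the hardware expects in the doorbell.
	 */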
1061	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
1062
1063	qp->mqp.event = mlx4_ib_qp_event;
1064	if (!*caller_qp)
1065		*caller_qp = qp;
1066	return 0;
1067
1068	err_qpn: /*unalloc_qpn_common(dev, qp, init_attr, qpn);*/
1069
1070	err_proxy: /*if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)
1071	 free_proxy_bufs(pd->device, qp);*/
1072	err_wrid: /*if (pd->uobject) {
1073	 if (qp_has_rq(init_attr))
1074	 mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db);
1075	 } else {*/
1076	free(qp->sq.wrid);
1077	free(qp->rq.wrid);
1078	/*}*/
1079
1080	err_mtt: /*mlx4_mtt_cleanup(dev->dev, &qp->mtt);*/
1081
1082	err_buf: /*if (pd->uobject)
1083	 ib_umem_release(qp->umem);
1084	 else
1085	 mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);*/
1086
1087	err_db: /*if (!pd->uobject && qp_has_rq(init_attr))
1088	 mlx4_db_free(dev->dev, &qp->db);
1089
1090	 if (qp->max_inline_data)
1091	 mlx4_bf_free(dev->dev, &qp->bf);*/
1092
1093	err: if (!*caller_qp)
1094		free(qp);
1095	return err;
1096}
1097
1098static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state) {
1099	switch (state) {
1100	case IB_QPS_RESET:
1101		return MLX4_QP_STATE_RST;
1102	case IB_QPS_INIT:
1103		return MLX4_QP_STATE_INIT;
1104	case IB_QPS_RTR:
1105		return MLX4_QP_STATE_RTR;
1106	case IB_QPS_RTS:
1107		return MLX4_QP_STATE_RTS;
1108	case IB_QPS_SQD:
1109		return MLX4_QP_STATE_SQD;
1110	case IB_QPS_SQE:
1111		return MLX4_QP_STATE_SQER;
1112	case IB_QPS_ERR:
1113		return MLX4_QP_STATE_ERR;
1114	default:
1115		return -1;
1116	}
1117}
1118/*
1119 static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq,
1120 struct mlx4_ib_cq *recv_cq)
1121 __acquires(&send_cq->lock) __acquires(&recv_cq->lock)
1122 {
1123 if (send_cq == recv_cq) {
1124 spin_lock_irq(&send_cq->lock);
1125 (void) __acquire(&recv_cq->lock);
1126 } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
1127 spin_lock_irq(&send_cq->lock);
1128 spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
1129 } else {
1130 spin_lock_irq(&recv_cq->lock);
1131 spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
1132 }
1133 }
1134
1135 static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq,
1136 struct mlx4_ib_cq *recv_cq)
1137 __releases(&send_cq->lock) __releases(&recv_cq->lock)
1138 {
1139 if (send_cq == recv_cq) {
1140 (void) __release(&recv_cq->lock);
1141 spin_unlock_irq(&send_cq->lock);
1142 } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
1143 spin_unlock(&recv_cq->lock);
1144 spin_unlock_irq(&send_cq->lock);
1145 } else {
1146 spin_unlock(&send_cq->lock);
1147 spin_unlock_irq(&recv_cq->lock);
1148 }
1149 }
1150
1151 static void del_gid_entries(struct mlx4_ib_qp *qp) {
1152 struct mlx4_ib_gid_entry *ge, *tmp;
1153
1154 list_for_each_entry_safe(ge, tmp, &qp->gid_list, list)
1155 {
1156 list_del(&ge->list);
1157 free(ge);
1158 }
1159 }
1160 */
1161static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp) {
1162	if (qp->ibqp.qp_type == IB_QPT_XRC_TGT)
1163		return to_mpd(to_mxrcd(qp->ibqp.xrcd)->pd);
1164	else
1165		return to_mpd(qp->ibqp.pd);
1166}
1167
1168static void get_cqs(struct mlx4_ib_qp *qp, struct mlx4_ib_cq **send_cq,
1169		struct mlx4_ib_cq **recv_cq) {
1170	switch (qp->ibqp.qp_type) {
1171	case IB_QPT_XRC_TGT:
1172		*send_cq = to_mcq(to_mxrcd(qp->ibqp.xrcd)->cq);
1173		*recv_cq = *send_cq;
1174		break;
1175	case IB_QPT_XRC_INI:
1176		*send_cq = to_mcq(qp->ibqp.send_cq);
1177		*recv_cq = *send_cq;
1178		break;
1179	default:
1180		*send_cq = to_mcq(qp->ibqp.send_cq);
1181		*recv_cq = to_mcq(qp->ibqp.recv_cq);
1182		break;
1183	}
1184}
1185/*
1186 static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
1187 int is_user) {
1188 struct mlx4_ib_cq *send_cq, *recv_cq;
1189
1190 if (qp->state != IB_QPS_RESET) {
1191 if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state), MLX4_QP_STATE_RST,
1192 NULL, 0, 0, &qp->mqp))
1193 MLX4_WARN("modify QP %06x to RESET failed.\n", qp->mqp.qpn);
1194 if (qp->pri.smac) {
1195 mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
1196 qp->pri.smac = 0;
1197 }
1198 if (qp->alt.smac) {
1199 mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
1200 qp->alt.smac = 0;
1201 }
1202 if (qp->pri.vid < 0x1000) {
1203 mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid);
1204 qp->pri.vid = 0xFFFF;
1205 qp->pri.candidate_vid = 0xFFFF;
1206 qp->pri.update_vid = 0;
1207 }
1208 if (qp->alt.vid < 0x1000) {
1209 mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid);
1210 qp->alt.vid = 0xFFFF;
1211 qp->alt.candidate_vid = 0xFFFF;
1212 qp->alt.update_vid = 0;
1213 }
1214 }
1215
1216 get_cqs(qp, &send_cq, &recv_cq);
1217
1218 mlx4_ib_lock_cqs(send_cq, recv_cq);
1219
1220 if (!is_user) {
1221 __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
1222 qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
1223 if (send_cq != recv_cq)
1224 __mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
1225 }
1226
1227 mlx4_qp_remove(dev->dev, &qp->mqp);
1228
1229 mlx4_ib_unlock_cqs(send_cq, recv_cq);
1230
1231 mlx4_qp_free(dev->dev, &qp->mqp);
1232
1233 if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp))
1234 release_qpn_common(dev, qp);
1235
1236 mlx4_mtt_cleanup(dev->dev, &qp->mtt);
1237
1238 if (is_user) {
1239 if (qp->rq.wqe_cnt)
1240 mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context), &qp->db);
1241 ib_umem_release(qp->umem);
1242 } else {
1243 free(qp->sq.wrid);
1244 free(qp->rq.wrid);
1245 if (qp->mlx4_ib_qp_type
1246 & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
1247 free_proxy_bufs(&dev->ib_dev, qp);
1248 mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
1249 if (qp->max_inline_data)
1250 mlx4_bf_free(dev->dev, &qp->bf);
1251
1252 if (qp->rq.wqe_cnt)
1253 mlx4_db_free(dev->dev, &qp->db);
1254 }
1255
1256 del_gid_entries(qp);
1257 }
1258 */
1259static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr) {
1260	/*Native or PPF*/
1261	if (!mlx4_is_mfunc(dev->dev)
1262			|| (mlx4_is_master(dev->dev)
1263					&& attr->create_flags & MLX4_IB_SRIOV_SQP)) {
1264		return dev->dev->phys_caps.base_sqpn
1265				+ (attr->qp_type == IB_QPT_SMI ? 0 : 2) + attr->port_num - 1;
1266	}
1267	/*PF or VF -- creating proxies*/
1268	if (attr->qp_type == IB_QPT_SMI)
1269		return dev->dev->caps.qp0_proxy[attr->port_num - 1];
1270	else
1271		return dev->dev->caps.qp1_proxy[attr->port_num - 1];
1272}
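/*
 * E.g. with two ports the native special QPs are laid out as base_sqpn + 0/1
 * for QP0 (SMI) on ports 1/2 and base_sqpn + 2/3 for QP1 (GSI) on ports 1/2,
 * which is exactly what the arithmetic above produces.
 */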
1273/*
1274 #ifdef __linux__
1275 static int check_qpg_attr(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr) {
1276 if (attr->qpg_type == IB_QPG_NONE)
1277 return 0;
1278
1279 if (attr->qp_type != IB_QPT_UD)
1280 return -EINVAL;
1281
1282 if (attr->qpg_type == IB_QPG_PARENT) {
1283 if (attr->parent_attrib.tss_child_count == 1)
1284 return -EINVAL;  Doesn't make sense
1285 if (attr->parent_attrib.rss_child_count == 1)
1286 return -EINVAL;  Doesn't make sense
1287 if ((attr->parent_attrib.tss_child_count == 0)
1288 && (attr->parent_attrib.rss_child_count == 0))
1289 Should be called with IP_QPG_NONE
1290 return -EINVAL;
1291 if (attr->parent_attrib.rss_child_count > 1) {
1292 int rss_align_num;
1293 if (!(dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS))
1294 return -ENOSYS;
1295 rss_align_num = roundup_pow_of_two(attr->parent_attrib.rss_child_count);
1296 if (rss_align_num > dev->dev->caps.max_rss_tbl_sz)
1297 return -EINVAL;
1298 }
1299 } else {
1300 struct mlx4_ib_qpg_data *qpg_data;
1301 if (attr->qpg_parent == NULL)
1302 return -EINVAL;
1303 if (IS_ERR(attr->qpg_parent))
1304 return -EINVAL;
1305 qpg_data = to_mqp(attr->qpg_parent)->qpg_data;
1306 if (qpg_data == NULL)
1307 return -EINVAL;
1308 if (attr->qpg_type == IB_QPG_CHILD_TX && !qpg_data->tss_child_count)
1309 return -EINVAL;
1310 if (attr->qpg_type == IB_QPG_CHILD_RX && !qpg_data->rss_child_count)
1311 return -EINVAL;
1312 }
1313 return 0;
1314 }
1315 #endif
1316 */
1317#define RESERVED_FLAGS_MASK ((((unsigned int)IB_QP_CREATE_RESERVED_END - 1) | IB_QP_CREATE_RESERVED_END)   \
1318							& ~(IB_QP_CREATE_RESERVED_START - 1))
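/*
 * For illustration (flag values assumed, not taken from the headers): if
 * IB_QP_CREATE_RESERVED_START were 1 << 26 and IB_QP_CREATE_RESERVED_END
 * were 1 << 31, the mask would cover exactly bits 26..31, i.e. the range
 * of create flags that is passed through to mlx4 untouched.
 */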
1319
1320static enum mlx4_ib_qp_flags to_mlx4_ib_qp_flags(
1321		enum ib_qp_create_flags ib_qp_flags) {
1322	enum mlx4_ib_qp_flags mlx4_ib_qp_flags = 0;
1323
1324	if (ib_qp_flags & IB_QP_CREATE_IPOIB_UD_LSO)
1325		mlx4_ib_qp_flags |= MLX4_IB_QP_LSO;
1326
1327	if (ib_qp_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
1328		mlx4_ib_qp_flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
1329
1330	if (ib_qp_flags & IB_QP_CREATE_NETIF_QP)
1331		mlx4_ib_qp_flags |= MLX4_IB_QP_NETIF;
1332
1333	/* reserved flags*/
1334	mlx4_ib_qp_flags |= (ib_qp_flags & RESERVED_FLAGS_MASK);
1335
1336	return mlx4_ib_qp_flags;
1337}
1338
1339struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
1340		struct ib_qp_init_attr *init_attr, struct ib_udata *udata) {
1341	struct mlx4_ib_qp *qp = NULL;
1342	int err;
1343	u16 xrcdn = 0;
1344	enum mlx4_ib_qp_flags mlx4_qp_flags = to_mlx4_ib_qp_flags(
1345			init_attr->create_flags);
1346	struct ib_device *device;
1347
	/* see ib_core::ib_create_qp, same handling */
1350	device = pd ? pd->device : init_attr->xrcd->device;
1351
	/*
	 * We only support LSO, vendor flag1, and multicast loopback
	 * blocking, and only for kernel UD QPs.
	 */
1360
1361	if (mlx4_qp_flags
1362			& ~(MLX4_IB_QP_LSO | MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK
1363					| MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP
1364					| MLX4_IB_QP_NETIF))
1365		return ERR_PTR(-EINVAL);
1366
1367	if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) {
1368		if (init_attr->qp_type != IB_QPT_UD)
1369			return ERR_PTR(-EINVAL);
1370	}
1371
1372	if (init_attr->create_flags
1373			&& (udata
1374					|| ((mlx4_qp_flags & ~MLX4_IB_SRIOV_SQP)
1375							&& init_attr->qp_type != IB_QPT_UD)
1376					|| ((mlx4_qp_flags & MLX4_IB_SRIOV_SQP)
1377							&& init_attr->qp_type > IB_QPT_GSI)))
1378		return ERR_PTR(-EINVAL);
1379
1380#ifdef __linux__
1381	assert(!"NYI");
1382	/*err = check_qpg_attr(to_mdev(device), init_attr);
1383	 if (err)
1384	 return ERR_PTR(err);*/
1385#endif
1386
1387	switch (init_attr->qp_type) {
1388	case IB_QPT_XRC_TGT:
1389		assert(!"NYI");
1390		/*pd = to_mxrcd(init_attr->xrcd)->pd;
1391		 xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn;
1392		 init_attr->send_cq = to_mxrcd(init_attr->xrcd)->cq;*/
1393		/*fall through*/
1394	case IB_QPT_XRC_INI:
1395		assert(!"NYI");
1396		/*if (!(to_mdev(device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
1397		 return ERR_PTR(-ENOSYS);
1398		 init_attr->recv_cq = init_attr->send_cq;*/
1399		/*fall through*/
1400	case IB_QPT_RC:
1401	case IB_QPT_UC:
1402	case IB_QPT_RAW_PACKET:
1403		qp = calloc(1, sizeof *qp);
1404		if (!qp)
1405			return ERR_PTR(-ENOMEM);
1406		qp->pri.vid = qp->alt.vid = 0xFFFF;
1407		/*fall through*/
1408	case IB_QPT_UD: {
1409		err = create_qp_common(to_mdev(device), pd, init_attr, udata, 0, &qp);
1410		if (err) {
1411			free(qp);
1412			return ERR_PTR(err);
1413		}
1414
1415		qp->ibqp.qp_num = qp->mqp.qpn;
1416		qp->xrcdn = xrcdn;
1417
1418		break;
1419	}
1420	case IB_QPT_SMI:
1421	case IB_QPT_GSI: {
1422		/*Userspace is not allowed to create special QPs:*/
1423		if (udata)
1424			return ERR_PTR(-EINVAL);
1425
1426		err = create_qp_common(to_mdev(device), pd, init_attr, udata,
1427				get_sqp_num(to_mdev(device), init_attr), &qp);
1428		if (err)
1429			return ERR_PTR(err);
1430
1431		qp->port = init_attr->port_num;
1432		qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
1433
1434		break;
1435	}
1436	default:
1437		/*Don't support raw QPs*/
1438		return ERR_PTR(-EINVAL);
1439	}
1440
1441	return &qp->ibqp;
1442}
1443/*
1444 int mlx4_ib_destroy_qp(struct ib_qp *qp) {
1445 struct mlx4_ib_dev *dev = to_mdev(qp->device);
1446 struct mlx4_ib_qp *mqp = to_mqp(qp);
1447 struct mlx4_ib_pd *pd;
1448
1449 if (is_qp0(dev, mqp))
1450 mlx4_CLOSE_PORT(dev->dev, mqp->port);
1451
1452 pd = get_pd(mqp);
1453 destroy_qp_common(dev, mqp, !!pd->ibpd.uobject);
1454
1455 if (is_sqp(dev, mqp))
1456 free(to_msqp(mqp));
1457 else
1458 free(mqp);
1459
1460 return 0;
1461 }
1462 */
1463static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type) {
1464	switch (type) {
1465	case MLX4_IB_QPT_RC:
1466		return MLX4_QP_ST_RC;
1467	case MLX4_IB_QPT_UC:
1468		return MLX4_QP_ST_UC;
1469	case MLX4_IB_QPT_UD:
1470		return MLX4_QP_ST_UD;
1471	case MLX4_IB_QPT_XRC_INI:
1472	case MLX4_IB_QPT_XRC_TGT:
1473		return MLX4_QP_ST_XRC;
1474	case MLX4_IB_QPT_SMI:
1475	case MLX4_IB_QPT_GSI:
1476	case MLX4_IB_QPT_RAW_PACKET:
1477		return MLX4_QP_ST_MLX;
1478
1479	case MLX4_IB_QPT_PROXY_SMI_OWNER:
1480	case MLX4_IB_QPT_TUN_SMI_OWNER:
1481		return (mlx4_is_mfunc(dev->dev) ? MLX4_QP_ST_MLX : -1);
1482	case MLX4_IB_QPT_PROXY_SMI:
1483	case MLX4_IB_QPT_TUN_SMI:
1484	case MLX4_IB_QPT_PROXY_GSI:
1485	case MLX4_IB_QPT_TUN_GSI:
1486		return (mlx4_is_mfunc(dev->dev) ? MLX4_QP_ST_UD : -1);
1487	default:
1488		return -1;
1489	}
1490}
1491
1492static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp,
1493		const struct ib_qp_attr *attr, int attr_mask) {
1494	u8 dest_rd_atomic;
1495	u32 access_flags;
1496	u32 hw_access_flags = 0;
1497
1498	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
1499		dest_rd_atomic = attr->max_dest_rd_atomic;
1500	else
1501		dest_rd_atomic = qp->resp_depth;
1502
1503	if (attr_mask & IB_QP_ACCESS_FLAGS)
1504		access_flags = attr->qp_access_flags;
1505	else
1506		access_flags = qp->atomic_rd_en;
1507
1508	if (!dest_rd_atomic)
1509		access_flags &= IB_ACCESS_REMOTE_WRITE;
1510
1511	if (access_flags & IB_ACCESS_REMOTE_READ)
1512		hw_access_flags |= MLX4_QP_BIT_RRE;
1513	if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
1514		hw_access_flags |= MLX4_QP_BIT_RAE;
1515	if (access_flags & IB_ACCESS_REMOTE_WRITE)
1516		hw_access_flags |= MLX4_QP_BIT_RWE;
1517
1518	return cpu_to_be32(hw_access_flags);
1519}
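/*
 * Note: when the responder depth (max_dest_rd_atomic) is zero the QP
 * cannot be the target of RDMA reads or atomics, so the code above keeps
 * only IB_ACCESS_REMOTE_WRITE; e.g. a request for READ | WRITE | ATOMIC
 * with dest_rd_atomic == 0 ends up setting just MLX4_QP_BIT_RWE.
 */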
1520
1521static void store_sqp_attrs(struct mlx4_ib_sqp *sqp,
1522		const struct ib_qp_attr *attr, int attr_mask) {
1523	if (attr_mask & IB_QP_PKEY_INDEX)
1524		sqp->pkey_index = attr->pkey_index;
1525	if (attr_mask & IB_QP_QKEY)
1526		sqp->qkey = attr->qkey;
1527	if (attr_mask & IB_QP_SQ_PSN)
1528		sqp->send_psn = attr->sq_psn;
1529}
1530
1531static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port) {
1532	path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
1533}
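/*
 * sched_queue encoding note: bit 6 selects the physical port (0 for port
 * 1, 1 for port 2), which is why the helper above masks with 0xbf and ORs
 * in (port - 1) << 6 while leaving the other bits (including the SL field)
 * untouched.
 */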
1534
1535static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
1536		struct mlx4_ib_qp *qp, struct mlx4_qp_path *path, u8 port,
1537		int is_primary) {
1538	/*struct net_device *ndev;
1539	 int err;*/
1540	int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port)
1541			== IB_LINK_LAYER_ETHERNET;
1542	/*u8 mac[6];
1543	 int is_mcast;
1544	 u16 vlan_tag;
1545	 int vidx;
1546	 int smac_index;
1547	 u64 u64_mac;
1548	 u8 *smac;
1549	 struct mlx4_roce_smac_vlan_info *smac_info;*/
1550
1551	path->grh_mylmc = ah->src_path_bits & 0x7f;
1552	path->rlid = cpu_to_be16(ah->dlid);
1553	if (ah->static_rate) {
1554		path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET;
1555		while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET
1556				&& !(1 << path->static_rate & dev->dev->caps.stat_rate_support))
1557			--path->static_rate;
1558	} else
1559		path->static_rate = 0;
1560
1561	if (ah->ah_flags & IB_AH_GRH) {
1562		if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) {
1563			MLX4_ERR("sgid_index (%u) too large. max is %d\n",
1564					ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1);
1565			return -1;
1566		}
1567
1568		path->grh_mylmc |= 1 << 7;
1569		path->mgid_index = ah->grh.sgid_index;
1570		path->hop_limit = ah->grh.hop_limit;
1571		path->tclass_flowlabel = cpu_to_be32(
1572				(ah->grh.traffic_class << 20) | (ah->grh.flow_label));
1573		memcpy(path->rgid, ah->grh.dgid.raw, 16);
1574	}
1575
1576	if (is_eth) {
1577		assert(!"NYI");
1578		/*if (!(ah->ah_flags & IB_AH_GRH))
1579		 return -1;
1580
1581		 path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((port - 1) << 6)
1582		 | ((ah->sl & 7) << 3);
1583
1584		 if (is_primary)
1585		 smac_info = &qp->pri;
1586		 else
1587		 smac_info = &qp->alt;
1588
1589		 vlan_tag = rdma_get_vlan_id(
1590		 &dev->iboe.gid_table[port - 1][ah->grh.sgid_index]);
1591		 if (vlan_tag < 0x1000) {
1592		 if (smac_info->vid < 0x1000) {
1593		 both valid vlan ids
1594		 if (smac_info->vid != vlan_tag) {
1595		 different VIDs.  unreg old and reg new
1596		 err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx);
1597		 if (err)
1598		 return err;
1599		 smac_info->candidate_vid = vlan_tag;
1600		 smac_info->candidate_vlan_index = vidx;
1601		 smac_info->candidate_vlan_port = port;
1602		 smac_info->update_vid = 1;
1603		 path->vlan_index = vidx;
1604		 path->fl = 1 << 6;
1605		 } else {
1606		 path->vlan_index = smac_info->vlan_index;
1607		 path->fl = 1 << 6;
1608		 }
1609		 } else {
1610		 no current vlan tag in qp
1611		 err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx);
1612		 if (err)
1613		 return err;
1614		 smac_info->candidate_vid = vlan_tag;
1615		 smac_info->candidate_vlan_index = vidx;
1616		 smac_info->candidate_vlan_port = port;
1617		 smac_info->update_vid = 1;
1618		 path->vlan_index = vidx;
1619		 path->fl = 1 << 6;
1620		 }
1621		 } else {
1622		 have current vlan tag. unregister it at modify-qp success
1623		 if (smac_info->vid < 0x1000) {
1624		 smac_info->candidate_vid = 0xFFFF;
1625		 smac_info->update_vid = 1;
1626		 }
1627		 }
1628
1629		 err = mlx4_ib_resolve_grh(dev, ah, mac, &is_mcast, port);
1630		 if (err)
1631		 return err;
1632
1633		 get smac_index for RoCE use.
1634		 * If no smac was yet assigned, register one.
1635		 * If one was already assigned, but the new mac differs,
1636		 * unregister the old one and register the new one.
1637
1638		 spin_lock(&dev->iboe.lock);
1639		 ndev = dev->iboe.netdevs[port - 1];
1640		 if (ndev) {
1641		 #ifdef __linux__
1642		 smac = ndev->dev_addr; fixme: cache this value
1643		 #else
1644		 smac = IF_LLADDR(ndev); fixme: cache this value
1645		 #endif
1646
1647		 u64_mac = mlx4_mac_to_u64(smac);
1648		 } else
1649		 u64_mac = dev->dev->caps.def_mac[port];
1650		 spin_unlock(&dev->iboe.lock);
1651
1652		 if (!smac_info->smac || smac_info->smac != u64_mac) {
1653		 register candidate now, unreg if needed, after success
1654		 smac_index = mlx4_register_mac(dev->dev, port, u64_mac);
1655		 if (smac_index >= 0) {
1656		 smac_info->candidate_smac_index = smac_index;
1657		 smac_info->candidate_smac = u64_mac;
1658		 smac_info->candidate_smac_port = port;
1659		 } else
1660		 return -EINVAL;
1661		 } else
1662		 smac_index = smac_info->smac_index;
1663
1664		 memcpy(path->dmac, mac, 6);
1665		 path->ackto = MLX4_IB_LINK_TYPE_ETH;
1666		 put MAC table smac index for IBoE
1667		 path->grh_mylmc = (u8)(smac_index) | 0x80;*/
1668
1669	} else
1670		path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((port - 1) << 6)
1671				| ((ah->sl & 0xf) << 2);
1672
1673	return 0;
1674}
1675
1676static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) {
1677	struct mlx4_ib_gid_entry *ge, *tmp;
1678
1679	list_for_each_entry_safe(ge, tmp, &qp->gid_list, list)
1680	{
1681		assert(!"NYI");
1682		/*if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) {
1683		 ge->added = 1;
1684		 ge->port = qp->port;
1685		 }*/
1686	}
1687}
1688/*
1689 static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev,
1690 struct mlx4_ib_qp *qp, struct mlx4_qp_context *context) {
1691 struct net_device *ndev;
1692 u64 u64_mac;
1693 u8 *smac;
1694 int smac_index;
1695
1696 ndev = dev->iboe.netdevs[qp->port - 1];
1697 if (ndev) {
1698 #ifdef __linux__
1699 smac = ndev->dev_addr;  fixme: cache this value
1700 #else
1701 smac = IF_LLADDR(ndev);  fixme: cache this value
1702 #endif
1703 u64_mac = mlx4_mac_to_u64(smac);
1704 } else
1705 u64_mac = dev->dev->caps.def_mac[qp->port];
1706
1707 context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE
1708 | ((qp->port - 1) << 6);
1709 if (!qp->pri.smac) {
1710 smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac);
1711 if (smac_index >= 0) {
1712 qp->pri.candidate_smac_index = smac_index;
1713 qp->pri.candidate_smac = u64_mac;
1714 qp->pri.candidate_smac_port = qp->port;
1715 context->pri_path.grh_mylmc = 0x80 | (u8) smac_index;
1716 } else
1717 return -ENOENT;
1718 }
1719 return 0;
1720 }
1721 */
1722static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
1723		const struct ib_qp_attr *attr, int attr_mask,
1724		enum ib_qp_state cur_state, enum ib_qp_state new_state) {
1725	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
1726	struct mlx4_ib_qp *qp = to_mqp(ibqp);
1727	struct mlx4_ib_pd *pd;
1728	struct mlx4_ib_cq *send_cq, *recv_cq;
1729	struct mlx4_qp_context *context;
1730	enum mlx4_qp_optpar optpar = 0;
1731	int sqd_event;
1732	int steer_qp = 0;
1733	int err = -EINVAL;
1734	int is_eth = -1;
1735
1736	context = calloc(1, sizeof *context);
1737	if (!context)
1738		return -ENOMEM;
1739
1740	context->flags = cpu_to_be32(
1741			(to_mlx4_state(new_state) << 28)
1742					| (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16));
1743
1744	if (!(attr_mask & IB_QP_PATH_MIG_STATE))
1745		context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
1746	else {
1747		optpar |= MLX4_QP_OPTPAR_PM_STATE;
1748		switch (attr->path_mig_state) {
1749		case IB_MIG_MIGRATED:
1750			context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
1751			break;
1752		case IB_MIG_REARM:
1753			context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11);
1754			break;
1755		case IB_MIG_ARMED:
1756			context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11);
1757			break;
1758		}
1759	}
1760
1761	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI)
1762		context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
1763	else if (ibqp->qp_type == IB_QPT_RAW_PACKET)
1764		context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX;
1765	else if (ibqp->qp_type == IB_QPT_UD) {
1766		if (qp->flags & MLX4_IB_QP_LSO)
1767			context->mtu_msgmax = (IB_MTU_4096 << 5)
1768					| ilog2(dev->dev->caps.max_gso_sz);
1769		else
1770			context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
1771	} else if (attr_mask & IB_QP_PATH_MTU) {
1772		if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
1773			MLX4_ERR("path MTU (%u) is invalid\n", attr->path_mtu);
1774			goto out;
1775		}
1776		context->mtu_msgmax = (attr->path_mtu << 5)
1777				| ilog2(dev->dev->caps.max_msg_sz);
1778	}
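	/*
	 * mtu_msgmax packs the path MTU into the high 3 bits and log2 of
	 * the maximum message size into the low 5 bits; e.g. with
	 * IB_MTU_4096 (5) and the 2^12-byte limit used for plain UD above,
	 * this is (5 << 5) | 12 = 0xac.
	 */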
1779
1780	if (qp->rq.wqe_cnt)
1781		context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3;
1782	context->rq_size_stride |= qp->rq.wqe_shift - 4;
1783
1784	if (qp->sq.wqe_cnt)
1785		context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3;
1786	context->sq_size_stride |= qp->sq.wqe_shift - 4;
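	/*
	 * Example of the encoding above: rq.wqe_cnt = 128 with a 64-byte
	 * stride gives rq_size_stride = (7 << 3) | (6 - 4) = 0x3a, i.e.
	 * log2 of the queue size in bits 3..6 and log2(stride) - 4 in the
	 * low three bits.
	 */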
1787
1788	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
1789		context->sq_size_stride |= !!qp->sq_no_prefetch << 7;
1790		context->xrcd = cpu_to_be32((u32) qp->xrcdn);
1791		context->param3 |= cpu_to_be32(1 << 30);
1792	}
1793
1794	if (qp->ibqp.uobject)
1795		context->usr_page = cpu_to_be32(
1796				to_mucontext(ibqp->uobject->context)->uar.index);
1797	else
1798		context->usr_page = cpu_to_be32(qp->bf.uar->index);
1799
1800	if (attr_mask & IB_QP_DEST_QPN)
1801		context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
1802
1803	if (attr_mask & IB_QP_PORT) {
1804		if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD
1805				&& !(attr_mask & IB_QP_AV)) {
1806			mlx4_set_sched(&context->pri_path, attr->port_num);
1807			optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE;
1808		}
1809	}
1810
1811	if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
1812		if (dev->counters[qp->port - 1] != -1) {
1813			context->pri_path.counter_index = dev->counters[qp->port - 1];
1814			optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX;
1815		} else
1816			context->pri_path.counter_index = 0xff;
1817
1818		if (qp->flags & MLX4_IB_QP_NETIF
1819				&& (qp->qpg_type == IB_QPG_NONE || qp->qpg_type == IB_QPG_PARENT)) {
1820			mlx4_ib_steer_qp_reg(dev, qp, 1);
1821			steer_qp = 1;
1822		}
1823	}
1824
1825	if (attr_mask & IB_QP_PKEY_INDEX) {
1826		if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
1827			context->pri_path.disable_pkey_check = 0x40;
1828		context->pri_path.pkey_index = attr->pkey_index;
1829		optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
1830	}
1831
1832	if (attr_mask & IB_QP_AV) {
1833		if (mlx4_set_path(dev, &attr->ah_attr, qp, &context->pri_path,
1834				attr_mask & IB_QP_PORT ? attr->port_num : qp->port, 1))
1835			goto out;
1836
1837		optpar |=
1838				(MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH | MLX4_QP_OPTPAR_SCHED_QUEUE);
1839	}
1840
1841	if (attr_mask & IB_QP_TIMEOUT) {
1842		context->pri_path.ackto |= attr->timeout << 3;
1843		optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
1844	}
1845
1846	if (attr_mask & IB_QP_ALT_PATH) {
1847		if (attr->alt_port_num == 0
1848				|| attr->alt_port_num > dev->dev->caps.num_ports)
1849			goto out;
1850
1851		if (attr->alt_pkey_index
1852				>= dev->dev->caps.pkey_table_len[attr->alt_port_num])
1853			goto out;
1854
1855		if (mlx4_set_path(dev, &attr->alt_ah_attr, qp, &context->alt_path,
1856				attr->alt_port_num, 0))
1857			goto out;
1858
1859		context->alt_path.pkey_index = attr->alt_pkey_index;
1860		context->alt_path.ackto = attr->alt_timeout << 3;
1861		optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
1862	}
1863
1864	pd = get_pd(qp);
1865	get_cqs(qp, &send_cq, &recv_cq);
1866	context->pd = cpu_to_be32(pd->pdn);
1867	context->cqn_send = cpu_to_be32(send_cq->mcq.cqn);
1868	context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn);
1869	context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
1870
1871	/*Set "fast registration enabled" for all kernel QPs*/
1872	if (!qp->ibqp.uobject)
1873		context->params1 |= cpu_to_be32(1 << 11);
1874
1875	if (attr_mask & IB_QP_RNR_RETRY) {
1876		context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
1877		optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
1878	}
1879
1880	if (attr_mask & IB_QP_RETRY_CNT) {
1881		context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
1882		optpar |= MLX4_QP_OPTPAR_RETRY_COUNT;
1883	}
1884
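	/* The context stores log2 of the outstanding RDMA/atomic limits, so
	 * fls(n - 1) yields the required ceil(log2(n)) for both directions. */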
1885	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
1886		if (attr->max_rd_atomic)
1887			context->params1 |= cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
1888		optpar |= MLX4_QP_OPTPAR_SRA_MAX;
1889	}
1890
1891	if (attr_mask & IB_QP_SQ_PSN)
1892		context->next_send_psn = cpu_to_be32(attr->sq_psn);
1893
1894	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
1895		if (attr->max_dest_rd_atomic)
1896			context->params2 |= cpu_to_be32(
1897					fls(attr->max_dest_rd_atomic - 1) << 21);
1898		optpar |= MLX4_QP_OPTPAR_RRA_MAX;
1899	}
1900
1901	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
1902		context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask);
1903		optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
1904	}
1905
1906	if (attr_mask & IB_M_EXT_CLASS_1)
1907		context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_MASTER);
1908
	/*for now we also enable sqe on send*/
1910	if (attr_mask & IB_M_EXT_CLASS_2) {
1911		context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_SYNC_SQ);
1912		context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_MASTER);
1913	}
1914
1915	if (attr_mask & IB_M_EXT_CLASS_3)
1916		context->params2 |= cpu_to_be32(MLX4_QP_BIT_COLL_SYNC_RQ);
1917
1918	if (ibqp->srq)
1919		context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);
1920
1921	if (attr_mask & IB_QP_MIN_RNR_TIMER) {
1922		context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
1923		optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT;
1924	}
1925	if (attr_mask & IB_QP_RQ_PSN)
1926		context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
1927
1928	/*proxy and tunnel qp qkeys will be changed in modify-qp wrappers*/
1929	if (attr_mask & IB_QP_QKEY) {
1930		if (qp->mlx4_ib_qp_type
1931				& (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))
1932			context->qkey = cpu_to_be32(IB_QP_SET_QKEY);
1933		else {
1934			if (mlx4_is_mfunc(
1935					dev->dev) && !(qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
1936					&& (attr->qkey & MLX4_RESERVED_QKEY_MASK)
1937					== MLX4_RESERVED_QKEY_BASE) {
1938				MLX4_ERR("Cannot use reserved QKEY"
1939						" 0x%x (range 0xffff0000..0xffffffff"
1940						" is reserved)\n", attr->qkey);
1941				err = -EINVAL;
1942				goto out;
1943			}
1944			context->qkey = cpu_to_be32(attr->qkey);
1945		}
1946		optpar |= MLX4_QP_OPTPAR_Q_KEY;
1947	}
1948
1949	if (ibqp->srq)
1950		context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn);
1951
1952	if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
1953		context->db_rec_addr = cpu_to_be64(qp->db.dma);
1954
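	/* For MLX, UD and raw-packet QPs moving to RTR, sched_queue carries the
	 * port number in bit 6 (qp->port - 1) plus the default scheduling queue
	 * for the QP type. */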
1955	if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR
1956			&& (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI
1957					|| ibqp->qp_type == IB_QPT_UD
1958					|| ibqp->qp_type == IB_QPT_RAW_PACKET)) {
1959		context->pri_path.sched_queue = (qp->port - 1) << 6;
1960		if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI
1961				|| qp->mlx4_ib_qp_type
1962						& (MLX4_IB_QPT_PROXY_SMI_OWNER
1963								| MLX4_IB_QPT_TUN_SMI_OWNER)) {
1964			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
1965			if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI)
1966				context->pri_path.fl = 0x80;
1967		} else {
1968			if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
1969				context->pri_path.fl = 0x80;
1970			context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
1971		}
1972		is_eth = rdma_port_get_link_layer(&dev->ib_dev, qp->port)
1973				== IB_LINK_LAYER_ETHERNET;
1974		if (is_eth) {
1975			assert(!"NYI");
1976			/*if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI
1977			 || qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI)
1978			 context->pri_path.feup = 1 << 7; don't fsm
1979			 handle smac_index
1980			 if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD
1981			 || qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI
1982			 || qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) {
1983			 err = handle_eth_ud_smac_index(dev, qp, context);
1984			 if (err)
1985			 return -EINVAL;
1986			 }*/
1987		}
1988	}
1989
1990	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD
1991			&& attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY
1992			&& attr->en_sqd_async_notify)
1993		sqd_event = 1;
1994	else
1995		sqd_event = 0;
1996
1997	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
1998		context->rlkey |= (1 << 4);
1999
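	/* For RSS parent QPs the RSS context is overlaid on the primary path
	 * area of the QP context, MLX4_RSS_OFFSET_IN_QPC_PRI_PATH bytes in. */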
2000	if ((attr_mask & IB_QP_GROUP_RSS) && (qp->qpg_data->rss_child_count > 1)) {
2001		struct mlx4_ib_qpg_data *qpg_data = qp->qpg_data;
2002		void *rss_context_base = &context->pri_path;
2003		struct mlx4_rss_context *rss_context =
2004				(struct mlx4_rss_context *) (rss_context_base
2005						+ MLX4_RSS_OFFSET_IN_QPC_PRI_PATH);
2006
2007		context->flags |= cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET);
2008
2009		/*This should be tbl_sz_base_qpn*/
2010		rss_context->base_qpn = cpu_to_be32(
2011				qpg_data->rss_qpn_base
2012						| (ilog2(qpg_data->rss_child_count) << 24));
2013		rss_context->default_qpn = cpu_to_be32(qpg_data->rss_qpn_base);
2014		/*This should be flags_hash_fn*/
2015		rss_context->flags = MLX4_RSS_TCP_IPV6 | MLX4_RSS_TCP_IPV4;
2016		if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UDP_RSS) {
2017			rss_context->base_qpn_udp = rss_context->default_qpn;
2018			rss_context->flags |= MLX4_RSS_IPV6 | MLX4_RSS_IPV4
2019					| MLX4_RSS_UDP_IPV6 | MLX4_RSS_UDP_IPV4;
2020		}
2021		if (dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_RSS_TOP) {
2022			static const u32 rsskey[10] = { 0xD181C62C, 0xF7F4DB5B, 0x1983A2FC,
2023					0x943E1ADB, 0xD9389E6B, 0xD1039C2C, 0xA74499AD, 0x593D56D9,
2024					0xF3253C06, 0x2ADC1FFC };
2025			rss_context->hash_fn = MLX4_RSS_HASH_TOP;
2026			memcpy(rss_context->rss_key, rsskey, sizeof(rss_context->rss_key));
2027		} else {
2028			rss_context->hash_fn = MLX4_RSS_HASH_XOR;
2029			memset(rss_context->rss_key, 0, sizeof(rss_context->rss_key));
2030		}
2031	}
2032
2033	/** Before passing a kernel QP to the HW, make sure that the
2034	 * ownership bits of the send queue are set and the SQ
2035	 * headroom is stamped so that the hardware doesn't start
2036	 * processing stale work requests.*/
2037
2038	if (!ibqp->uobject && cur_state == IB_QPS_RESET
2039			&& new_state == IB_QPS_INIT) {
2040		struct mlx4_wqe_ctrl_seg *ctrl;
2041		int i;
2042
2043		for (i = 0; i < qp->sq.wqe_cnt; ++i) {
2044			ctrl = get_send_wqe(qp, i);
2045			ctrl->owner_opcode = cpu_to_be32(1U << 31);
2046			if (qp->sq_max_wqes_per_wr == 1)
2047				ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
2048
2049			stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
2050		}
2051	}
2052
2053	err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state),
2054			to_mlx4_state(new_state), context, optpar, sqd_event, &qp->mqp);
2055	if (err)
2056		goto out;
2057
2058	qp->state = new_state;
2059
2060	if (attr_mask & IB_QP_ACCESS_FLAGS)
2061		qp->atomic_rd_en = attr->qp_access_flags;
2062	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
2063		qp->resp_depth = attr->max_dest_rd_atomic;
2064	if (attr_mask & IB_QP_PORT) {
2065		qp->port = attr->port_num;
2066		update_mcg_macs(dev, qp);
2067	}
2068	if (attr_mask & IB_QP_ALT_PATH)
2069		qp->alt_port = attr->alt_port_num;
2070
2071	if (is_sqp(dev, qp))
2072		store_sqp_attrs(to_msqp(qp), attr, attr_mask);
2073
2074	/*Set 'ignore_cq_overrun' bits for collectives offload*/
2075	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
2076		if (attr_mask & (IB_M_EXT_CLASS_2 | IB_M_EXT_CLASS_3)) {
2077			err = mlx4_ib_ignore_overrun_cq(ibqp->send_cq);
2078			if (err) {
2079				MLX4_ERR("Failed to set ignore CQ "
2080						"overrun for QP 0x%x's send CQ\n", ibqp->qp_num);
2081				goto out;
2082			}
2083
2084			if (ibqp->recv_cq != ibqp->send_cq) {
2085				err = mlx4_ib_ignore_overrun_cq(ibqp->recv_cq);
2086				if (err) {
2087					MLX4_ERR("Failed to set ignore "
2088							"CQ overrun for QP 0x%x's recv "
2089							"CQ\n", ibqp->qp_num);
2090					goto out;
2091				}
2092			}
2093		}
2094	}
2095
2096	/** If we moved QP0 to RTR, bring the IB link up; if we moved
2097	 * QP0 to RESET or ERROR, bring the link back down.*/
2098
2099	if (is_qp0(dev, qp)) {
2100		if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR)
2101			if (mlx4_INIT_PORT(dev->dev, qp->port))
2102				MLX4_WARN("INIT_PORT failed for port %d\n", qp->port);
2103
2104		if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR
2105				&& (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
2106			assert(!"NYI");
2107		/*mlx4_CLOSE_PORT(dev->dev, qp->port);*/
2108	}
2109
2110	/** If we moved a kernel QP to RESET, clean up all old CQ
2111	 * entries and reinitialize the QP.*/
2112
2113	if (new_state == IB_QPS_RESET) {
2114		assert(!"NYI");
2115		/*if (!ibqp->uobject) {
2116		 mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
2117		 ibqp->srq ? to_msrq(ibqp->srq) : NULL);
2118		 if (send_cq != recv_cq)
2119		 mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
2120
2121		 qp->rq.head = 0;
2122		 qp->rq.tail = 0;
2123		 qp->sq.head = 0;
2124		 qp->sq.tail = 0;
2125		 qp->sq_next_wqe = 0;
2126		 if (qp->rq.wqe_cnt)
2127		 *qp->db.db = 0;
2128
2129		 if (qp->flags & MLX4_IB_QP_NETIF
2130		 && (qp->qpg_type == IB_QPG_NONE
2131		 || qp->qpg_type == IB_QPG_PARENT))
2132		 mlx4_ib_steer_qp_reg(dev, qp, 0);
2133		 }
2134		 if (qp->pri.smac) {
2135		 mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
2136		 qp->pri.smac = 0;
2137		 }
2138		 if (qp->alt.smac) {
2139		 mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
2140		 qp->alt.smac = 0;
2141		 }
2142		 if (qp->pri.vid < 0x1000) {
2143		 mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid);
2144		 qp->pri.vid = 0xFFFF;
2145		 qp->pri.candidate_vid = 0xFFFF;
2146		 qp->pri.update_vid = 0;
2147		 }
2148
2149		 if (qp->alt.vid < 0x1000) {
2150		 mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid);
2151		 qp->alt.vid = 0xFFFF;
2152		 qp->alt.candidate_vid = 0xFFFF;
2153		 qp->alt.update_vid = 0;
2154		 }*/
2155	}
2156
	out:
	if (err && steer_qp)
		mlx4_ib_steer_qp_reg(dev, qp, 0);
	free(context);

	/*TODO: port the remaining cleanup (candidate SMAC/VLAN rollback) below*/
	/*if (qp->pri.candidate_smac) {
2162	 if (err)
2163	 mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port,
2164	 qp->pri.candidate_smac);
2165	 else {
2166	 if (qp->pri.smac) {
2167	 mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
2168	 }
2169	 qp->pri.smac = qp->pri.candidate_smac;
2170	 qp->pri.smac_index = qp->pri.candidate_smac_index;
2171	 qp->pri.smac_port = qp->pri.candidate_smac_port;
2172
2173	 }
2174	 qp->pri.candidate_smac = 0;
2175	 qp->pri.candidate_smac_index = 0;
2176	 qp->pri.candidate_smac_port = 0;
2177	 }
2178	 if (qp->alt.candidate_smac) {
2179	 if (err)
2180	 mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port,
	 qp->alt.candidate_smac);
	 else {
	 if (qp->alt.smac) {
	 mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
	 }
	 qp->alt.smac = qp->alt.candidate_smac;
	 qp->alt.smac_index = qp->alt.candidate_smac_index;
	 qp->alt.smac_port = qp->alt.candidate_smac_port;

	 }
	 qp->alt.candidate_smac = 0;
	 qp->alt.candidate_smac_index = 0;
	 qp->alt.candidate_smac_port = 0;
2194	 }
2195
2196	 if (qp->pri.update_vid) {
2197	 if (err) {
2198	 if (qp->pri.candidate_vid < 0x1000)
2199	 mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port,
2200	 qp->pri.candidate_vid);
2201	 } else {
2202	 if (qp->pri.vid < 0x1000)
2203	 mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid);
2204	 qp->pri.vid = qp->pri.candidate_vid;
2205	 qp->pri.vlan_port = qp->pri.candidate_vlan_port;
2206	 qp->pri.vlan_index = qp->pri.candidate_vlan_index;
2207	 }
2208	 qp->pri.candidate_vid = 0xFFFF;
2209	 qp->pri.update_vid = 0;
2210	 }
2211
2212	 if (qp->alt.update_vid) {
2213	 if (err) {
2214	 if (qp->alt.candidate_vid < 0x1000)
2215	 mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port,
2216	 qp->alt.candidate_vid);
2217	 } else {
2218	 if (qp->alt.vid < 0x1000)
2219	 mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid);
2220	 qp->alt.vid = qp->alt.candidate_vid;
2221	 qp->alt.vlan_port = qp->alt.candidate_vlan_port;
2222	 qp->alt.vlan_index = qp->alt.candidate_vlan_index;
2223	 }
2224	 qp->alt.candidate_vid = 0xFFFF;
2225	 qp->alt.update_vid = 0;
2226	 }*/
2227
2228	return err;
2229}
2230
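/** Verbs entry point for modify-QP: validates the attribute mask, port,
 * pkey index and RDMA depths against device capabilities, then hands the
 * actual transition to __mlx4_ib_modify_qp.  (The QP mutex is still
 * commented out in this port.) */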
2231int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
2232		int attr_mask, struct ib_udata *udata) {
2233	struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
2234	struct mlx4_ib_qp *qp = to_mqp(ibqp);
2235	enum ib_qp_state cur_state, new_state;
2236	int err = -EINVAL;
2237
2238	/*mutex_lock(&qp->mutex);*/
2239
2240	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
2241	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
2242
2243	if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
2244			attr_mask & ~IB_M_QP_MOD_VEND_MASK)) {
2245		MLX4_DEBUG("qpn 0x%x: invalid attribute mask specified "
2246				"for transition %d to %d. qp_type %d,"
2247				" attr_mask 0x%x\n", ibqp->qp_num, cur_state, new_state,
2248				ibqp->qp_type, attr_mask);
2249		goto out;
2250	}
2251
2252	if ((attr_mask & IB_M_QP_MOD_VEND_MASK) && !dev->dev->caps.sync_qp) {
2253		MLX4_ERR("extended verbs are not supported by %s\n", dev->ib_dev.name);
2254		goto out;
2255	}
2256
2257	if ((attr_mask & IB_QP_PORT)
2258			&& (attr->port_num == 0 || attr->port_num > dev->num_ports)) {
2259		MLX4_DEBUG("qpn 0x%x: invalid port number (%d) specified "
2260				"for transition %d to %d. qp_type %d\n", ibqp->qp_num,
2261				attr->port_num, cur_state, new_state, ibqp->qp_type);
2262		goto out;
2263	}
2264
2265	if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_PACKET)
2266			&& (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num)
2267					!= IB_LINK_LAYER_ETHERNET))
2268		goto out;
2269
2270	if (attr_mask & IB_QP_PKEY_INDEX) {
2271		int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
2272		if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) {
2273			MLX4_DEBUG("qpn 0x%x: invalid pkey index (%d) specified "
2274					"for transition %d to %d. qp_type %d\n", ibqp->qp_num,
2275					attr->pkey_index, cur_state, new_state, ibqp->qp_type);
2276			goto out;
2277		}
2278	}
2279
2280	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC
2281			&& attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
2282		MLX4_DEBUG("qpn 0x%x: max_rd_atomic (%d) too large. "
2283				"Transition %d to %d. qp_type %d\n", ibqp->qp_num,
2284				attr->max_rd_atomic, cur_state, new_state, ibqp->qp_type);
2285		goto out;
2286	}
2287
2288	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC
2289			&& attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) {
2290		MLX4_DEBUG("qpn 0x%x: max_dest_rd_atomic (%d) too large. "
2291				"Transition %d to %d. qp_type %d\n", ibqp->qp_num,
2292				attr->max_dest_rd_atomic, cur_state, new_state, ibqp->qp_type);
2293		goto out;
2294	}
2295
2296	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
2297		err = 0;
2298		goto out;
2299	}
2300
2301	err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
2302
2303	out: /*mutex_unlock(&qp->mutex);*/
2304	return err;
2305}
2306/*
2307 static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
2308 struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len) {
2309 struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device);
2310 struct ib_device *ib_dev = &mdev->ib_dev;
2311 struct mlx4_wqe_mlx_seg *mlx = wqe;
2312 struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
2313 struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
2314 u16 pkey;
2315 u32 qkey;
2316 int send_size;
2317 int header_size;
2318 int spc;
2319 int i;
2320
2321 if (wr->opcode != IB_WR_SEND)
2322 return -EINVAL;
2323
2324 send_size = 0;
2325
2326 for (i = 0; i < wr->num_sge; ++i)
2327 send_size += wr->sg_list[i].length;
2328
2329 for proxy-qp0 sends, need to add in size of tunnel header
2330 for tunnel-qp0 sends, tunnel header is already in s/g list
2331 if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER)
2332 send_size += sizeof(struct mlx4_ib_tunnel_header);
2333
2334 ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header);
2335
2336 if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) {
2337 sqp->ud_header.lrh.service_level = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel)
2338 >> 28;
2339 sqp->ud_header.lrh.destination_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
2340 sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
2341 }
2342
2343 mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
2344
2345 force loopback
2346 mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR);
2347 mlx->rlid = sqp->ud_header.lrh.destination_lid;
2348
2349 sqp->ud_header.lrh.virtual_lane = 0;
2350 sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
2351 ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey);
2352 sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
2353 if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER)
2354 sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
2355 else
2356 sqp->ud_header.bth.destination_qpn = cpu_to_be32(
2357 mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]);
2358
2359 sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
2360 if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey))
2361 return -EINVAL;
2362 sqp->ud_header.deth.qkey = cpu_to_be32(qkey);
2363 sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn);
2364
2365 sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
2366 sqp->ud_header.immediate_present = 0;
2367
2368 header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
2369
2370
2371 * Inline data segments may not cross a 64 byte boundary.  If
2372 * our UD header is bigger than the space available up to the
2373 * next 64 byte boundary in the WQE, use two inline data
2374 * segments to hold the UD header.
2375
2376 spc = MLX4_INLINE_ALIGN - ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
2377 if (header_size <= spc) {
2378 inl->byte_count = cpu_to_be32(1U << 31 | header_size);
2379 memcpy(inl + 1, sqp->header_buf, header_size);
2380 i = 1;
2381 } else {
2382 inl->byte_count = cpu_to_be32(1U << 31 | spc);
2383 memcpy(inl + 1, sqp->header_buf, spc);
2384
2385 inl = (void *) (inl + 1) + spc;
2386 memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
2387
2388 * Need a barrier here to make sure all the data is
2389 * visible before the byte_count field is set.
2390 * Otherwise the HCA prefetcher could grab the 64-byte
2391 * chunk with this inline segment and get a valid (!=
2392 * 0xffffffff) byte count but stale data, and end up
2393 * generating a packet with bad headers.
2394 *
2395 * The first inline segment's byte_count field doesn't
2396 * need a barrier, because it comes after a
2397 * control/MLX segment and therefore is at an offset
2398 * of 16 mod 64.
2399
2400 wmb();
2401 inl->byte_count = cpu_to_be32(1U << 31 | (header_size - spc));
2402 i = 2;
2403 }
2404
2405 *mlx_seg_len = ALIGN(i * sizeof(struct mlx4_wqe_inline_seg) + header_size, 16);
2406 return 0;
2407 }
2408 */
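/** Build the MLX (raw IB) header for a send on an SMI/GSI special QP and
 * lay it out as one or two inline segments right behind the control
 * segment; *mlx_seg_len returns the 16-byte-aligned length consumed.
 * The RoCE (Ethernet) variant is not ported yet and asserts. */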
2409static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
2410		void *wqe, unsigned *mlx_seg_len) {
2411	/*struct ib_device *ib_dev = sqp->qp.ibqp.device;*/
2412	struct mlx4_wqe_mlx_seg *mlx = wqe;
2413	/*struct mlx4_wqe_ctrl_seg *ctrl = wqe;*/
2414	struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
2415	struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
2416	/*union ib_gid sgid;
2417	 u16 pkey;*/
2418	int send_size;
2419	int header_size;
2420	int spc;
2421	int i;
2422	int is_eth;
2423	int is_vlan = 0;
2424	int is_grh;
2425	/*u16 vlan = 0;
2426	 int err = 0;*/
2427
2428	send_size = 0;
2429	for (i = 0; i < wr->num_sge; ++i)
2430		send_size += wr->sg_list[i].length;
2431
2432	is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port)
2433			== IB_LINK_LAYER_ETHERNET;
2434	is_grh = mlx4_ib_ah_grh_present(ah);
2435	if (is_eth) {
2436		assert(!"NYI");
2437		/*if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
2438		 When multi-function is enabled, the ib_core gid
2439		 * indexes don't necessarily match the hw ones, so
2440		 * we must use our own cache
2441		 err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev,
2442		 be32_to_cpu(ah->av.ib.port_pd) >> 24, ah->av.ib.gid_index,
2443		 &sgid.raw[0]);
2444		 if (err)
2445		 return err;
2446		 } else {
2447		 err = ib_get_cached_gid(ib_dev,
2448		 be32_to_cpu(ah->av.ib.port_pd) >> 24, ah->av.ib.gid_index,
2449		 &sgid);
2450		 if (err)
2451		 return err;
2452		 }
2453
2454		 vlan = rdma_get_vlan_id(&sgid);
2455		 is_vlan = vlan < 0x1000;*/
2456	}
2457	ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0,
2458			&sqp->ud_header);
2459
2460	if (!is_eth) {
2461		sqp->ud_header.lrh.service_level = be32_to_cpu(
2462				ah->av.ib.sl_tclass_flowlabel) >> 28;
2463		sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid;
2464		sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
2465	}
2466
2467	if (is_grh) {
2468		assert(!"NYI");
2469		/*sqp->ud_header.grh.traffic_class = (be32_to_cpu(
2470		 ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
2471		 sqp->ud_header.grh.flow_label = ah->av.ib.sl_tclass_flowlabel
2472		 & cpu_to_be32(0xfffff);
2473		 sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit;
2474		 if (is_eth)
2475		 memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16);
2476		 else {
2477		 if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
2478		 When multi-function is enabled, the ib_core gid
2479		 * indexes don't necessarily match the hw ones, so
2480		 * we must use our own cache
2481		 sqp->ud_header.grh.source_gid.global.subnet_prefix = to_mdev(
2482		 ib_dev)->sriov.demux[sqp->qp.port - 1].subnet_prefix;
2483		 sqp->ud_header.grh.source_gid.global.interface_id =
2484		 to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].guid_cache[ah->av.ib.gid_index];
2485		 } else
2486		 ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24,
2487		 ah->av.ib.gid_index, &sqp->ud_header.grh.source_gid);
2488		 }
2489		 memcpy(sqp->ud_header.grh.destination_gid.raw, ah->av.ib.dgid, 16);*/
2490	}
2491
2492	mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
2493
2494	if (!is_eth) {
2495		mlx->flags |= cpu_to_be32(
2496				(!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0)
2497						| (sqp->ud_header.lrh.destination_lid ==
2498						IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0)
2499						| (sqp->ud_header.lrh.service_level << 8));
2500		if (ah->av.ib.port_pd & cpu_to_be32(0x80000000))
2501			mlx->flags |= cpu_to_be32(0x1); /* force loopback */
2502		mlx->rlid = sqp->ud_header.lrh.destination_lid;
2503	}
2504
2505	switch (wr->opcode) {
2506	case IB_WR_SEND:
2507		sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
2508		sqp->ud_header.immediate_present = 0;
2509		break;
2510	case IB_WR_SEND_WITH_IMM:
2511		sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
2512		sqp->ud_header.immediate_present = 1;
2513		sqp->ud_header.immediate_data = wr->ex.imm_data;
2514		break;
2515	default:
2516		return -EINVAL;
2517	}
2518
2519	if (is_eth) {
2520		assert(!"NYI");
2521		/*u8 smac[6];
2522		 struct in6_addr in6;
2523
2524		 u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13;
2525
2526		 mlx->sched_prio = cpu_to_be16(pcp);
2527
2528		 memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
2529		 FIXME: cache smac value?
2530		 memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2);
2531		 memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4);
2532		 memcpy(&in6, sgid.raw, sizeof(in6));
2533		 rdma_get_ll_mac(&in6, smac);
2534		 memcpy(sqp->ud_header.eth.smac_h, smac, 6);
2535		 if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
2536		 mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
2537		 if (!is_vlan) {
2538		 sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
2539		 } else {
2540		 sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
2541		 sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
2542		 }*/
2543	} else {
2544		sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0;
2545		if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
2546			sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
2547	}
2548	sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
2549	/*if (!sqp->qp.ibqp.qp_num)
2550	 ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
2551	 else
2552	 ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);*/
2553	sqp->ud_header.bth.pkey = 0;/*cpu_to_be16(pkey);*/
2554	sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
2555	sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
2556	sqp->ud_header.deth.qkey = cpu_to_be32(
2557			wr->wr.ud.remote_qkey & 0x80000000 ?
2558					sqp->qkey : wr->wr.ud.remote_qkey);
2559	sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
2560
2561	header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
2562
2563	if (0) {
2564		/*pr_err("built UD header of size %d:\n", header_size);
2565		 for (i = 0; i < header_size / 4; ++i) {
2566		 if (i % 8 == 0)
2567		 pr_err("  [%02x] ", i * 4);
2568		 pr_cont(" %08x", be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
2569		 if ((i + 1) % 8 == 0)
2570		 pr_cont("\n");
2571		 }
2572		 pr_err("\n");*/
2573	}
2574
2575	/*
2576	 * Inline data segments may not cross a 64 byte boundary.  If
2577	 * our UD header is bigger than the space available up to the
2578	 * next 64 byte boundary in the WQE, use two inline data
2579	 * segments to hold the UD header.
2580	 */
2581	spc = MLX4_INLINE_ALIGN
2582			- ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
2583	if (header_size <= spc) {
2584		inl->byte_count = cpu_to_be32(1U << 31 | header_size);
2585		memcpy(inl + 1, sqp->header_buf, header_size);
2586		i = 1;
2587	} else {
2588		inl->byte_count = cpu_to_be32(1U << 31 | spc);
2589		memcpy(inl + 1, sqp->header_buf, spc);
2590
2591		inl = (void *) (inl + 1) + spc;
2592		memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
2593		/*
2594		 * Need a barrier here to make sure all the data is
2595		 * visible before the byte_count field is set.
2596		 * Otherwise the HCA prefetcher could grab the 64-byte
2597		 * chunk with this inline segment and get a valid (!=
2598		 * 0xffffffff) byte count but stale data, and end up
2599		 * generating a packet with bad headers.
2600		 *
2601		 * The first inline segment's byte_count field doesn't
2602		 * need a barrier, because it comes after a
2603		 * control/MLX segment and therefore is at an offset
2604		 * of 16 mod 64.
2605		 */
2606		wmb();
2607		inl->byte_count = cpu_to_be32(1U << 31 | (header_size - spc));
2608		i = 2;
2609	}
2610
2611	*mlx_seg_len = ALIGN(i * sizeof(struct mlx4_wqe_inline_seg) + header_size,
2612			16);
2613	return 0;
2614}
2615
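/** Return nonzero if posting nreq more work requests would overflow wq.
 * The head/tail snapshot is retaken under what is the CQ lock in the
 * Linux driver; the locking is still commented out in this port. */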
2616static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq,
2617		struct ib_cq *ib_cq) {
2618	unsigned cur;
2619	struct mlx4_ib_cq *cq;
2620
2621	cur = wq->head - wq->tail;
2622	if (/*likely(*/cur + nreq < wq->max_post/*)*/)
2623		return 0;
2624
2625	cq = to_mcq(ib_cq);
2626	/*spin_lock(&cq->lock);*/
2627	cur = wq->head - wq->tail;
2628	/*spin_unlock(&cq->lock);*/
2629
2630	return cur + nreq >= wq->max_post;
2631}
2632/*
2633 static __be32 convert_access(int acc) {
2634 return (
2635 acc & IB_ACCESS_REMOTE_ATOMIC ?
2636 cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC) : 0)
2637 | (
2638 acc & IB_ACCESS_REMOTE_WRITE ?
2639 cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0)
2640 | (
2641 acc & IB_ACCESS_REMOTE_READ ?
2642 cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ) : 0)
2643 | (acc & IB_ACCESS_LOCAL_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0)
2644 | cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ);
2645 }
2646
2647 static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr) {
2648 struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list);
2649 int i;
2650
2651 for (i = 0; i < wr->wr.fast_reg.page_list_len; ++i)
2652 mfrpl->mapped_page_list[i] = cpu_to_be64(
2653 wr->wr.fast_reg.page_list->page_list[i] | MLX4_MTT_FLAG_PRESENT);
2654
2655 fseg->flags = convert_access(wr->wr.fast_reg.access_flags);
2656 fseg->mem_key = cpu_to_be32(wr->wr.fast_reg.rkey);
2657 fseg->buf_list = cpu_to_be64(mfrpl->map);
2658 fseg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);
2659 fseg->reg_len = cpu_to_be64(wr->wr.fast_reg.length);
2660 fseg->offset = 0;  XXX -- is this just for ZBVA?
2661 fseg->page_size = cpu_to_be32(wr->wr.fast_reg.page_shift);
2662 fseg->reserved[0] = 0;
2663 fseg->reserved[1] = 0;
2664 }
2665
2666 static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey) {
2667 iseg->mem_key = cpu_to_be32(rkey);
2668
2669 iseg->reserved1 = 0;
2670 iseg->reserved2 = 0;
2671 iseg->reserved3[0] = 0;
2672 iseg->reserved3[1] = 0;
2673 }
2674 */
2675static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
2676		u64 remote_addr, u32 rkey) {
2677	rseg->raddr = cpu_to_be64(remote_addr);
2678	rseg->rkey = cpu_to_be32(rkey);
2679	rseg->reserved = 0;
2680}
2681/*
2682 static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg,
2683 struct ib_send_wr *wr) {
2684 if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
2685 aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
2686 aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add);
2687 } else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) {
2688 aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
2689 aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add_mask);
2690 } else {
2691 aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
2692 aseg->compare = 0;
2693 }
2694
2695 }
2696
2697 static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg,
2698 struct ib_send_wr *wr) {
2699 aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
2700 aseg->swap_add_mask = cpu_to_be64(wr->wr.atomic.swap_mask);
2701 aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add);
2702 aseg->compare_mask = cpu_to_be64(wr->wr.atomic.compare_add_mask);
2703 }
2704
2705 static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
2706 struct ib_send_wr *wr) {
2707 memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof(struct mlx4_av));
2708 dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
2709 dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
2710 dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan;
2711 memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6);
2712 }
2713
2714 static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev,
2715 struct mlx4_wqe_datagram_seg *dseg, struct ib_send_wr *wr, enum ib_qp_type qpt) {
2716 union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av;
2717 struct mlx4_av sqp_av = { 0 };
2718 int port = *((u8 *) &av->ib.port_pd) & 0x3;
2719
2720 force loopback
2721 sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000);
2722 sqp_av.g_slid = av->ib.g_slid & 0x7f;  no GRH
2723 sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel
2724 & cpu_to_be32(0xf0000000);
2725
2726 memcpy(dseg->av, &sqp_av, sizeof(struct mlx4_av));
2727 This function used only for sending on QP1 proxies
2728 dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]);
2729 Use QKEY from the QP context, which is set by master
2730 dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY);
2731 }
2732
2733 static void build_tunnel_header(struct ib_send_wr *wr, void *wqe,
2734 unsigned *mlx_seg_len) {
2735 struct mlx4_wqe_inline_seg *inl = wqe;
2736 struct mlx4_ib_tunnel_header hdr;
2737 struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
2738 int spc;
2739 int i;
2740
2741 memcpy(&hdr.av, &ah->av, sizeof hdr.av);
2742 hdr.remote_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
2743 hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index);
2744 hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
2745
2746 spc = MLX4_INLINE_ALIGN - ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
2747 if (sizeof(hdr) <= spc) {
2748 memcpy(inl + 1, &hdr, sizeof(hdr));
2749 wmb();
2750 inl->byte_count = cpu_to_be32(1U << 31 | sizeof(hdr));
2751 i = 1;
2752 } else {
2753 memcpy(inl + 1, &hdr, spc);
2754 wmb();
2755 inl->byte_count = cpu_to_be32(1U << 31 | spc);
2756
2757 inl = (void *) (inl + 1) + spc;
2758 memcpy(inl + 1, (void *) &hdr + spc, sizeof(hdr) - spc);
2759 wmb();
2760 inl->byte_count = cpu_to_be32(1U << 31 | (sizeof(hdr) - spc));
2761 i = 2;
2762 }
2763
2764 *mlx_seg_len = ALIGN(i * sizeof(struct mlx4_wqe_inline_seg) + sizeof(hdr), 16);
2765 }
2766 */
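/** Write the extra 4-byte inline segment that holds the ICRC placeholder
 * for MLX (SMI/GSI) sends; the byte count is written last, behind a
 * barrier, like any other inline segment. */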
2767static void set_mlx_icrc_seg(void *dseg) {
2768	u32 *t = dseg;
2769	struct mlx4_wqe_inline_seg *iseg = dseg;
2770
2771	t[1] = 0;
2772
2773	/** Need a barrier here before writing the byte_count field to
2774	 * make sure that all the data is visible before the
2775	 * byte_count field is set.  Otherwise, if the segment begins
2776	 * a new cacheline, the HCA prefetcher could grab the 64-byte
	 * chunk and get a valid (!= 0xffffffff) byte count but
2778	 * stale data, and end up sending the wrong data.*/
2779
2780	wmb();
2781
2782	iseg->byte_count = cpu_to_be32((1U << 31) | 4);
2783}
2784
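/* set_data_seg writes the byte count last, behind a barrier, because send
 * WQEs may be prefetched by the HCA as soon as the count looks valid;
 * __set_data_seg below is the plain variant used by the receive path. */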
2785static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) {
2786	dseg->lkey = cpu_to_be32(sg->lkey);
2787	dseg->addr = cpu_to_be64(sg->addr);
2788
2789	/** Need a barrier here before writing the byte_count field to
2790	 * make sure that all the data is visible before the
2791	 * byte_count field is set.  Otherwise, if the segment begins
2792	 * a new cacheline, the HCA prefetcher could grab the 64-byte
	 * chunk and get a valid (!= 0xffffffff) byte count but
2794	 * stale data, and end up sending the wrong data.*/
2795
2796	wmb();
2797
2798	dseg->byte_count = cpu_to_be32(sg->length);
2799}
2800
2801static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg) {
2802	dseg->byte_count = cpu_to_be32(sg->length);
2803	dseg->lkey = cpu_to_be32(sg->lkey);
2804	dseg->addr = cpu_to_be64(sg->addr);
2805}
2806/*
2807 static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
2808 struct mlx4_ib_qp *qp, unsigned *lso_seg_len, __be32 *lso_hdr_sz, __be32 *blh) {
2809 unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16);
2810
2811 if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE))
2812 *blh = cpu_to_be32(1 << 6);
2813
2814 if (unlikely(
2815 !(qp->flags & MLX4_IB_QP_LSO) && wr->num_sge > qp->sq.max_gs - (halign >> 4)))
2816 return -EINVAL;
2817
2818 memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen);
2819
2820 *lso_hdr_sz = cpu_to_be32(
2821 (wr->wr.ud.mss - wr->wr.ud.hlen) << 16 | wr->wr.ud.hlen);
2822 *lso_seg_len = halign;
2823 return 0;
2824 }
2825 */
2826static __be32 send_ieth(struct ib_send_wr *wr) {
2827	switch (wr->opcode) {
2828	case IB_WR_SEND_WITH_IMM:
2829	case IB_WR_RDMA_WRITE_WITH_IMM:
2830		return wr->ex.imm_data;
2831
2832	case IB_WR_SEND_WITH_INV:
2833		return cpu_to_be32(wr->ex.invalidate_rkey);
2834
2835	default:
2836		return 0;
2837	}
2838}
2839/*
2840 static void add_zero_len_inline(void *wqe) {
2841 struct mlx4_wqe_inline_seg *inl = wqe;
2842 memset(wqe, 0, 16);
2843 inl->byte_count = cpu_to_be32(1U << 31);
2844 }
2845 */
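/** Copy the scatter/gather list of an IB_SEND_INLINE work request directly
 * into the WQE, splitting it into multiple inline segments so that no
 * segment crosses a 64-byte boundary.  Returns -1 if the total length
 * exceeds the QP's max_inline_data; on success *sz holds the number of
 * 16-byte chunks consumed. */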
2846static int lay_inline_data(struct mlx4_ib_qp *qp, struct ib_send_wr *wr,
2847		void *wqe, int *sz) {
2848	struct mlx4_wqe_inline_seg *seg;
2849	void *addr;
2850	int len, seg_len;
2851	int num_seg;
2852	int off, to_copy;
2853	int i;
2854	int inl = 0;
2855
2856	seg = wqe;
2857	wqe += sizeof *seg;
2858	off = ((unsigned long) wqe) & (unsigned long) (MLX4_INLINE_ALIGN - 1);
2859	num_seg = 0;
2860	seg_len = 0;
2861
2862	for (i = 0; i < wr->num_sge; ++i) {
2863		addr = (void *) (unsigned long) (wr->sg_list[i].addr);
2864		len = wr->sg_list[i].length;
2865		inl += len;
2866
2867		if (inl > qp->max_inline_data) {
2868			inl = 0;
2869			return -1;
2870		}
2871
2872		while (len >= MLX4_INLINE_ALIGN - off) {
2873			to_copy = MLX4_INLINE_ALIGN - off;
2874			memcpy(wqe, addr, to_copy);
2875			len -= to_copy;
2876			wqe += to_copy;
2877			addr += to_copy;
2878			seg_len += to_copy;
2879			wmb();
2880			/*see comment below*/
2881			seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
2882			seg_len = 0;
2883			seg = wqe;
2884			wqe += sizeof *seg;
2885			off = sizeof *seg;
2886			++num_seg;
2887		}
2888
2889		memcpy(wqe, addr, len);
2890		wqe += len;
2891		seg_len += len;
2892		off += len;
2893	}
2894
2895	if (seg_len) {
2896		++num_seg;
2897
2898		/** Need a barrier here to make sure
2899		 * all the data is visible before the
2900		 * byte_count field is set.  Otherwise
2901		 * the HCA prefetcher could grab the
2902		 * 64-byte chunk with this inline
2903		 * segment and get a valid (!=
2904		 * 0xffffffff) byte count but stale
2905		 * data, and end up sending the wrong
2906		 * data.*/
2907
2908		wmb();
2909		seg->byte_count = htonl(MLX4_INLINE_SEG | seg_len);
2910	}
2911
2912	*sz = (inl + num_seg * sizeof *seg + 15) / 16;
2913
2914	return 0;
2915}
2916/*
2917
2918 * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
2919 * implementations may use move-string-buffer assembler instructions,
2920 * which do not guarantee order of copying.
2921 */
2922static void mlx4_bf_copy(unsigned long *dst, unsigned long *src,
2923		unsigned bytecnt) {
2924	__iowrite64_copy(dst, src, bytecnt / 8);
2925}
2926
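/** Post a chain of send work requests.  WQEs are built in place in the
 * send queue, the ownership bit is written last, and posted WQEs are
 * stamped to keep the HCA prefetcher off stale entries.  A single small
 * inline WQE goes out through the BlueFlame page; anything else rings the
 * regular send doorbell. */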
2927int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
2928		struct ib_send_wr **bad_wr) {
2929	struct mlx4_ib_qp *qp = to_mqp(ibqp);
2930	void *wqe;
2931	struct mlx4_wqe_ctrl_seg *uninitialized_var( ctrl);
2932	struct mlx4_wqe_data_seg *dseg;
2933	/*unsigned long flags;*/
2934	int nreq;
2935	int err = 0;
2936	unsigned ind;
2937	int uninitialized_var( stamp);
2938	int uninitialized_var( size);
2939	unsigned uninitialized_var( seglen);
2940	__be32 dummy;
2941	__be32 *lso_wqe;
2942	__be32 uninitialized_var( lso_hdr_sz);
2943	__be32 blh;
2944	int i;
2945	int inl = 0;
2946	u32 *aux;
2947	/*spin_lock_irqsave(&qp->sq.lock, flags);*/
2948
2949	ind = qp->sq_next_wqe;
2950
2951	for (nreq = 0; wr; ++nreq, wr = wr->next) {
2952		lso_wqe = &dummy;
2953		blh = 0;
2954
2955		if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
2956			err = -ENOMEM;
2957			*bad_wr = wr;
2958			goto out;
2959		}
2960
2961		if (unlikely(wr->num_sge > qp->sq.max_gs)) {
2962			err = -EINVAL;
2963			*bad_wr = wr;
2964			goto out;
2965		}
2966
2967		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
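		/* aux overlays the 32-bit word starting at vlan_tag (assumed to be
		 * the word libmlx4 treats as bf_qpn); it is zeroed here so the
		 * BlueFlame path below can simply OR doorbell_qpn into it. */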
2968		aux = (u32 *) &ctrl->vlan_tag;
2969		*aux = 0;
2970		qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
2971
2972		ctrl->srcrb_flags = (
2973				wr->send_flags & IB_SEND_SIGNALED ?
2974						cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0)
2975				| (wr->send_flags & IB_SEND_SOLICITED ?
2976						cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0)
2977				| ((wr->send_flags & IB_SEND_IP_CSUM) ?
2978						cpu_to_be32(
2979								MLX4_WQE_CTRL_IP_CSUM
2980										| MLX4_WQE_CTRL_TCP_UDP_CSUM) :
2981						0) | qp->sq_signal_bits;
2982
2983		ctrl->imm = send_ieth(wr);
2984
2985		wqe += sizeof *ctrl;
2986		size = sizeof *ctrl / 16;
2987
2988		switch (qp->mlx4_ib_qp_type) {
2989		case MLX4_IB_QPT_RC:
2990		case MLX4_IB_QPT_UC:
2991			switch (wr->opcode) {
2992			case IB_WR_ATOMIC_CMP_AND_SWP:
2993			case IB_WR_ATOMIC_FETCH_AND_ADD:
2994			case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
2995				assert(!"NYI");
2996				/*set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
2997				 wr->wr.atomic.rkey);
2998				 wqe += sizeof(struct mlx4_wqe_raddr_seg);
2999
3000				 set_atomic_seg(wqe, wr);
3001				 wqe += sizeof(struct mlx4_wqe_atomic_seg);
3002
3003				 size += (sizeof(struct mlx4_wqe_raddr_seg)
3004				 + sizeof(struct mlx4_wqe_atomic_seg)) / 16;*/
3005
3006				break;
3007
3008			case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
3009				assert(!"NYI");
3010				/*set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
3011				 wr->wr.atomic.rkey);
3012				 wqe += sizeof(struct mlx4_wqe_raddr_seg);
3013
3014				 set_masked_atomic_seg(wqe, wr);
3015				 wqe += sizeof(struct mlx4_wqe_masked_atomic_seg);
3016
3017				 size += (sizeof(struct mlx4_wqe_raddr_seg)
3018				 + sizeof(struct mlx4_wqe_masked_atomic_seg)) / 16;*/
3019
3020				break;
3021
3022			case IB_WR_RDMA_READ:
3023			case IB_WR_RDMA_WRITE:
3024			case IB_WR_RDMA_WRITE_WITH_IMM:
3025				set_raddr_seg(wqe, wr->wr.rdma.remote_addr, wr->wr.rdma.rkey);
3026				wqe += sizeof(struct mlx4_wqe_raddr_seg);
3027				size += sizeof(struct mlx4_wqe_raddr_seg) / 16;
3028				break;
3029
3030			case IB_WR_LOCAL_INV:
3031				assert(!"NYI");
3032				/*ctrl->srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
3033				 set_local_inv_seg(wqe, wr->ex.invalidate_rkey);
3034				 wqe += sizeof(struct mlx4_wqe_local_inval_seg);
3035				 size += sizeof(struct mlx4_wqe_local_inval_seg) / 16;*/
3036				break;
3037
3038			case IB_WR_FAST_REG_MR:
3039				assert(!"NYI");
3040				/*ctrl->srcrb_flags |= cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
3041				 set_fmr_seg(wqe, wr);
3042				 wqe += sizeof(struct mlx4_wqe_fmr_seg);
3043				 size += sizeof(struct mlx4_wqe_fmr_seg) / 16;*/
3044				break;
3045
3046			default:
3047				/*No extra segments required for sends*/
3048				break;
3049			}
3050			break;
3051
3052		case MLX4_IB_QPT_TUN_SMI_OWNER:
3053			assert(!"NYI");
3054			/*err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen);
3055			 if (unlikely(err)) {
3056			 *bad_wr = wr;
3057			 goto out;
3058			 }
3059			 wqe += seglen;
3060			 size += seglen / 16;*/
3061			break;
3062		case MLX4_IB_QPT_TUN_SMI:
3063		case MLX4_IB_QPT_TUN_GSI:
3064			assert(!"NYI");
3065			/*this is a UD qp used in MAD responses to slaves.
3066			 set_datagram_seg(wqe, wr);
3067			 set the forced-loopback bit in the data seg av
3068			 *(__be32 *) wqe |= cpu_to_be32(0x80000000);
3069			 wqe += sizeof(struct mlx4_wqe_datagram_seg);
3070			 size += sizeof(struct mlx4_wqe_datagram_seg) / 16;*/
3071			break;
3072		case MLX4_IB_QPT_UD:
3073			assert(!"NYI");
3074			/*set_datagram_seg(wqe, wr);
3075			 wqe += sizeof(struct mlx4_wqe_datagram_seg);
3076			 size += sizeof(struct mlx4_wqe_datagram_seg) / 16;
3077
3078			 if (wr->opcode == IB_WR_LSO) {
3079			 err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh);
3080			 if (unlikely(err)) {
3081			 *bad_wr = wr;
3082			 goto out;
3083			 }
3084			 lso_wqe = (__be32 *) wqe;
3085			 wqe += seglen;
3086			 size += seglen / 16;
3087			 }*/
3088			break;
3089
3090		case MLX4_IB_QPT_PROXY_SMI_OWNER:
3091			assert(!"NYI");
3092			/*if (unlikely(!mlx4_is_master(to_mdev(ibqp->device)->dev))) {
3093			 err = -ENOSYS;
3094			 *bad_wr = wr;
3095			 goto out;
3096			 }
3097			 err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen);
3098			 if (unlikely(err)) {
3099			 *bad_wr = wr;
3100			 goto out;
3101			 }
3102			 wqe += seglen;
3103			 size += seglen / 16;
3104			 to start tunnel header on a cache-line boundary
3105			 add_zero_len_inline(wqe);
3106			 wqe += 16;
3107			 size++;
3108			 build_tunnel_header(wr, wqe, &seglen);
3109			 wqe += seglen;
3110			 size += seglen / 16;*/
3111			break;
3112		case MLX4_IB_QPT_PROXY_SMI:
3113			assert(!"NYI");
3114			/*don't allow QP0 sends on guests*/
3115			/*err = -ENOSYS;
3116			 *bad_wr = wr;*/
3117			goto out;
3118		case MLX4_IB_QPT_PROXY_GSI:
3119			assert(!"NYI");
3120			/*If we are tunneling special qps, this is a UD qp.
3121			 * In this case we first add a UD segment targeting
3122			 * the tunnel qp, and then add a header with address
3123			 * information*/
3124			/*set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr,
3125			 ibqp->qp_type);
3126			 wqe += sizeof(struct mlx4_wqe_datagram_seg);
3127			 size += sizeof(struct mlx4_wqe_datagram_seg) / 16;
3128			 build_tunnel_header(wr, wqe, &seglen);
3129			 wqe += seglen;
3130			 size += seglen / 16;*/
3131			break;
3132
3133		case MLX4_IB_QPT_SMI:
3134		case MLX4_IB_QPT_GSI:
3135			err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen);
3136			if (unlikely(err)) {
3137				*bad_wr = wr;
3138				goto out;
3139			}
3140			wqe += seglen;
3141			size += seglen / 16;
3142			break;
3143
3144		default:
3145			break;
3146		}
3147
3148		/** Write data segments in reverse order, so as to
3149		 * overwrite cacheline stamp last within each
3150		 * cacheline.  This avoids issues with WQE
3151		 * prefetching.*/
3152
3153		dseg = wqe;
3154		dseg += wr->num_sge - 1;
3155
3156		/*Add one more inline data segment for ICRC for MLX sends*/
3157		if (unlikely(
3158				qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI
3159						|| qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI
3160						|| qp->mlx4_ib_qp_type
3161								& (MLX4_IB_QPT_PROXY_SMI_OWNER
3162										| MLX4_IB_QPT_TUN_SMI_OWNER))) {
3163			set_mlx_icrc_seg(dseg + 1);
3164			size += sizeof(struct mlx4_wqe_data_seg) / 16;
3165		}
3166
		if (wr->send_flags & IB_SEND_INLINE && wr->num_sge) {
			int sz;
			err = lay_inline_data(qp, wr, wqe, &sz);
			if (err) {
				/*inline data does not fit in max_inline_data*/
				inl = 0;
				err = -ENOMEM;
				*bad_wr = wr;
				goto out;
			}
			inl = 1;
			size += sz;
3174		} else {
3175			size += wr->num_sge * (sizeof(struct mlx4_wqe_data_seg) / 16);
3176			for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
3177				set_data_seg(dseg, wr->sg_list + i);
3178		}
3179
3180		/** Possibly overwrite stamping in cacheline with LSO
3181		 * segment only after making sure all data segments
3182		 * are written.*/
3183
3184		wmb();
3185		*lso_wqe = lso_hdr_sz;
3186		ctrl->fence_size = (
3187				wr->send_flags & IB_SEND_FENCE ? MLX4_WQE_CTRL_FENCE : 0)
3188				| size;
3189
3190		/** Make sure descriptor is fully written before
3191		 * setting ownership bit (because HW can start
3192		 * executing as soon as we do).*/
3193
3194		wmb();
3195
3196		if (wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
3197			*bad_wr = wr;
3198			err = -EINVAL;
3199			goto out;
3200		}
3201
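		/* The ownership bit flips every time the producer index wraps the
		 * send queue, hence the test against ind & wqe_cnt. */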
3202		ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode]
3203				| (ind & qp->sq.wqe_cnt ? cpu_to_be32(1U << 31) : 0) | blh;
3204
3205		stamp = ind + qp->sq_spare_wqes;
3206		ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
3207
3208		/** We can improve latency by not stamping the last
3209		 * send queue WQE until after ringing the doorbell, so
3210		 * only stamp here if there are still more WQEs to post.
3211		 *
3212		 * Same optimization applies to padding with NOP wqe
3213		 * in case of WQE shrinking (used to prevent wrap-around
3214		 * in the middle of WR).*/
3215
3216		if (wr->next) {
3217			stamp_send_wqe(qp, stamp, size * 16);
3218			ind = pad_wraparound(qp, ind);
3219		}
3220	}
3221
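	/* A single small inline WQE is copied straight to the BlueFlame page
	 * instead of ringing the doorbell; every other case rings the regular
	 * send doorbell after a barrier. */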
3222	out: if (nreq == 1 && inl && size > 1 && size < qp->bf.buf_size / 16) {
3223		ctrl->owner_opcode |= htonl((qp->sq_next_wqe & 0xffff) << 8);
		/*The doorbell_qpn bits were zeroed above as part of the vlan
		 * tag initialization, so |= is correct here.*/
3226
3227		aux = (u32 *) &ctrl->vlan_tag;
3228		*aux |= qp->doorbell_qpn;
3229
3230		/** Make sure that descriptor is written to memory
3231		 * before writing to BlueFlame page.*/
3232
3233		wmb();
3234
3235		++qp->sq.head;
3236
3237		mlx4_bf_copy(qp->bf.reg + qp->bf.offset, (unsigned long *) ctrl,
3238				ALIGN(size * 16, 64));
3239		wc_wmb();
3240
3241		qp->bf.offset ^= qp->bf.buf_size;
3242
3243	} else if (nreq) {
3244		qp->sq.head += nreq;
3245
		/** Make sure that descriptors are written before
3247		 * doorbell record.*/
3248
3249		wmb();
3250
3251		writel(qp->doorbell_qpn, qp->bf.uar->map + MLX4_SEND_DOORBELL);
3252
3253		/** Make sure doorbells don't leak out of SQ spinlock
3254		 * and reach the HCA out of order.*/
3255
3256		mmiowb();
3257
3258	}
3259
3260	if (likely(nreq)) {
3261		stamp_send_wqe(qp, stamp, size * 16);
3262		ind = pad_wraparound(qp, ind);
3263		qp->sq_next_wqe = ind;
3264	}
3265
3266	/*spin_unlock_irqrestore(&qp->sq.lock, flags);*/
3267
3268	return err;
3269}
3270
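/** Post a chain of receive work requests.  Proxy QPs reserve the first
 * scatter entry for the tunneled SQP header; a scatter list shorter than
 * max_gs is terminated with an MLX4_INVALID_LKEY entry. */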
3271int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
3272		struct ib_recv_wr **bad_wr) {
3273	struct mlx4_ib_qp *qp = to_mqp(ibqp);
3274	struct mlx4_wqe_data_seg *scat;
3275	/*unsigned long flags;*/
3276	int err = 0;
3277	int nreq;
3278	int ind;
3279	int max_gs;
3280	int i;
3281
3282	max_gs = qp->rq.max_gs;
3283	/*spin_lock_irqsave(&qp->rq.lock, flags);*/
3284
3285	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
3286
3287	for (nreq = 0; wr; ++nreq, wr = wr->next) {
3288		if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
3289			err = -ENOMEM;
3290			*bad_wr = wr;
3291			goto out;
3292		}
3293
3294		if (/*unlikely(*/wr->num_sge > qp->rq.max_gs/*)*/) {
3295			err = -EINVAL;
3296			*bad_wr = wr;
3297			goto out;
3298		}
3299
3300		scat = get_recv_wqe(qp, ind);
3301
3302		if (qp->mlx4_ib_qp_type
3303				& (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI
3304						| MLX4_IB_QPT_PROXY_GSI)) {
3305			/*ib_dma_sync_single_for_device(ibqp->device,
3306			 qp->sqp_proxy_rcv[ind].map,
3307			 sizeof(struct mlx4_ib_proxy_sqp_hdr), DMA_FROM_DEVICE);*/
3308			scat->byte_count = cpu_to_be32(
3309					sizeof(struct mlx4_ib_proxy_sqp_hdr));
3310			/*use dma lkey from upper layer entry*/
3311			scat->lkey = cpu_to_be32(wr->sg_list->lkey);
3312			scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map);
3313			scat++;
3314			max_gs--;
3315		}
3316
3317		for (i = 0; i < wr->num_sge; ++i)
3318			__set_data_seg(scat + i, wr->sg_list + i);
3319
3320		if (i < max_gs) {
3321			scat[i].byte_count = 0;
3322			scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY);
3323			scat[i].addr = 0;
3324		}
3325
3326		qp->rq.wrid[ind] = wr->wr_id;
3327
3328		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
3329	}
3330
3331	out: if (/*likely(*/nreq/*)*/) {
3332		qp->rq.head += nreq;
3333
3334		/** Make sure that descriptors are written before
3335		 * doorbell record.*/
3336
3337		wmb();
3338
3339		*qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
3340	}
3341
3342	/*spin_unlock_irqrestore(&qp->rq.lock, flags);*/
3343
3344	return err;
3345}
3346/*
3347 static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state) {
3348 switch (mlx4_state) {
3349 case MLX4_QP_STATE_RST:
3350 return IB_QPS_RESET;
3351 case MLX4_QP_STATE_INIT:
3352 return IB_QPS_INIT;
3353 case MLX4_QP_STATE_RTR:
3354 return IB_QPS_RTR;
3355 case MLX4_QP_STATE_RTS:
3356 return IB_QPS_RTS;
3357 case MLX4_QP_STATE_SQ_DRAINING:
3358 case MLX4_QP_STATE_SQD:
3359 return IB_QPS_SQD;
3360 case MLX4_QP_STATE_SQER:
3361 return IB_QPS_SQE;
3362 case MLX4_QP_STATE_ERR:
3363 return IB_QPS_ERR;
3364 default:
3365 return -1;
3366 }
3367 }
3368
3369 static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state) {
3370 switch (mlx4_mig_state) {
3371 case MLX4_QP_PM_ARMED:
3372 return IB_MIG_ARMED;
3373 case MLX4_QP_PM_REARM:
3374 return IB_MIG_REARM;
3375 case MLX4_QP_PM_MIGRATED:
3376 return IB_MIG_MIGRATED;
3377 default:
3378 return -1;
3379 }
3380 }
3381
3382 static int to_ib_qp_access_flags(int mlx4_flags) {
3383 int ib_flags = 0;
3384
3385 if (mlx4_flags & MLX4_QP_BIT_RRE)
3386 ib_flags |= IB_ACCESS_REMOTE_READ;
3387 if (mlx4_flags & MLX4_QP_BIT_RWE)
3388 ib_flags |= IB_ACCESS_REMOTE_WRITE;
3389 if (mlx4_flags & MLX4_QP_BIT_RAE)
3390 ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
3391
3392 return ib_flags;
3393 }
3394
3395 static void to_ib_ah_attr(struct mlx4_ib_dev *ibdev,
3396 struct ib_ah_attr *ib_ah_attr, struct mlx4_qp_path *path) {
3397 struct mlx4_dev *dev = ibdev->dev;
3398 int is_eth;
3399
3400 memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
3401 ib_ah_attr->port_num = path->sched_queue & 0x40 ? 2 : 1;
3402
3403 if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports)
3404 return;
3405
3406 is_eth = rdma_port_get_link_layer(&ibdev->ib_dev, ib_ah_attr->port_num)
3407 == IB_LINK_LAYER_ETHERNET;
3408 if (is_eth)
3409 ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7)
3410 | ((path->sched_queue & 4) << 1);
3411 else
3412 ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf;
3413
3414 ib_ah_attr->dlid = be16_to_cpu(path->rlid);
3415 ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f;
3416 ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0;
3417 ib_ah_attr->ah_flags = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0;
3418 if (ib_ah_attr->ah_flags) {
3419 ib_ah_attr->grh.sgid_index = path->mgid_index;
3420 ib_ah_attr->grh.hop_limit = path->hop_limit;
3421 ib_ah_attr->grh.traffic_class = (be32_to_cpu(path->tclass_flowlabel) >> 20)
3422 & 0xff;
3423 ib_ah_attr->grh.flow_label = be32_to_cpu(path->tclass_flowlabel) & 0xfffff;
3424 memcpy(ib_ah_attr->grh.dgid.raw, path->rgid, sizeof ib_ah_attr->grh.dgid.raw);
3425 }
3426 }
3427
3428 int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
3429 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) {
3430 struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
3431 struct mlx4_ib_qp *qp = to_mqp(ibqp);
3432 struct mlx4_qp_context context;
3433 int mlx4_state;
3434 int err = 0;
3435
3436 mutex_lock(&qp->mutex);
3437
3438 if (qp->state == IB_QPS_RESET) {
3439 qp_attr->qp_state = IB_QPS_RESET;
3440 goto done;
3441 }
3442
3443 err = mlx4_qp_query(dev->dev, &qp->mqp, &context);
3444 if (err) {
3445 err = -EINVAL;
3446 goto out;
3447 }
3448
3449 mlx4_state = be32_to_cpu(context.flags) >> 28;
3450
3451 qp->state = to_ib_qp_state(mlx4_state);
3452 qp_attr->qp_state = qp->state;
3453 qp_attr->path_mtu = context.mtu_msgmax >> 5;
3454 qp_attr->path_mig_state = to_ib_mig_state(
3455 (be32_to_cpu(context.flags) >> 11) & 0x3);
3456 qp_attr->qkey = be32_to_cpu(context.qkey);
3457 qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
3458 qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff;
3459 qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff;
3460 qp_attr->qp_access_flags = to_ib_qp_access_flags(be32_to_cpu(context.params2));
3461
3462 if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
3463 to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path);
3464 to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path);
3465 qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
3466 qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num;
3467 }
3468
3469 qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
3470 if (qp_attr->qp_state == IB_QPS_INIT)
3471 qp_attr->port_num = qp->port;
3472 else
3473 qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1;
3474
3475 qp_attr->en_sqd_async_notify is only applicable in modify qp
3476 qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;
3477
3478 qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);
3479
3480 qp_attr->max_dest_rd_atomic = 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
3481 qp_attr->min_rnr_timer = (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
3482 qp_attr->timeout = context.pri_path.ackto >> 3;
3483 qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7;
3484 qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7;
3485 qp_attr->alt_timeout = context.alt_path.ackto >> 3;
3486
3487 done: qp_attr->cur_qp_state = qp_attr->qp_state;
3488 qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt;
3489 qp_attr->cap.max_recv_sge = qp->rq.max_gs;
3490
3491 if (!ibqp->uobject) {
3492 qp_attr->cap.max_send_wr = qp->sq.wqe_cnt;
3493 qp_attr->cap.max_send_sge = qp->sq.max_gs;
3494 } else {
3495 qp_attr->cap.max_send_wr = 0;
3496 qp_attr->cap.max_send_sge = 0;
3497 }
3498
3499
3500 * We don't support inline sends for kernel QPs (yet), and we
3501 * don't know what userspace's value should be.
3502
3503 qp_attr->cap.max_inline_data = 0;
3504
3505 qp_init_attr->cap = qp_attr->cap;
3506
3507 qp_init_attr->create_flags = 0;
3508 if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
3509 qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
3510
3511 if (qp->flags & MLX4_IB_QP_LSO)
3512 qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
3513
3514 if (qp->flags & MLX4_IB_QP_NETIF)
3515 qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP;
3516
3517 qp_init_attr->sq_sig_type =
3518 qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ?
3519 IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
3520
3521 qp_init_attr->qpg_type = ibqp->qpg_type;
3522 if (ibqp->qpg_type == IB_QPG_PARENT)
3523 qp_init_attr->cap.qpg_tss_mask_sz = qp->qpg_data->qpg_tss_mask_sz;
3524 else
3525 qp_init_attr->cap.qpg_tss_mask_sz = 0;
3526
3527 out: mutex_unlock(&qp->mutex);
3528 return err;
3529 }
3530
3531 */
3532