/*-
 * Copyright (c) 2013-2020, Mellanox Technologies, Ltd.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD$
 */

#include "mlx5_ib.h"

#include <dev/mlx5/cmd.h>

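/*
 * Human readable names and descriptions for the congestion control
 * parameters and statistics, expanded as (name, description) pairs
 * from the MLX5_IB_CONG_PARAMS() and MLX5_IB_CONG_STATS() macros.
 */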
static const char *mlx5_ib_cong_params_desc[] = {
	MLX5_IB_CONG_PARAMS(MLX5_IB_STATS_DESC)
};

static const char *mlx5_ib_cong_stats_desc[] = {
	MLX5_IB_CONG_STATS(MLX5_IB_STATS_DESC)
};

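/*
 * MLX5_IB_INDEX() computes the index of a named field within the
 * congestion.arg[] array of u64 values, so one index addresses both
 * parameters and statistics.  MLX5_IB_SET_CLIPPED() clamps the value
 * to the width of the destination field before storing it with
 * MLX5_SET(), avoiding silent truncation of oversized sysctl input.
 */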
#define	MLX5_IB_INDEX(field) ( \
    (__offsetof(struct mlx5_ib_congestion, field) - \
     __offsetof(struct mlx5_ib_congestion, arg[0])) / sizeof(u64))
#define	MLX5_IB_FLD_MAX(type, field) ((1ULL << __mlx5_bit_sz(type, field)) - 1ULL)
#define	MLX5_IB_SET_CLIPPED(type, ptr, field, var) do { \
  /* rangecheck */					\
  if ((var) > MLX5_IB_FLD_MAX(type, field))		\
	(var) = MLX5_IB_FLD_MAX(type, field);		\
  /* set value */					\
  MLX5_SET(type, ptr, field, var);			\
} while (0)

#define	CONG_LOCK(dev) sx_xlock(&(dev)->congestion.lock)
#define	CONG_UNLOCK(dev) sx_xunlock(&(dev)->congestion.lock)
#define	CONG_LOCKED(dev) sx_xlocked(&(dev)->congestion.lock)

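/*
 * Bit positions used to build the field_select mask of the
 * MODIFY_CONG_PARAMS command; each bit enables modification of the
 * correspondingly named RP or NP parameter.
 */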
#define	MLX5_IB_RP_CLAMP_TGT_RATE_ATTR			BIT(1)
#define	MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR		BIT(2)
#define	MLX5_IB_RP_TIME_RESET_ATTR			BIT(3)
#define	MLX5_IB_RP_BYTE_RESET_ATTR			BIT(4)
#define	MLX5_IB_RP_THRESHOLD_ATTR			BIT(5)
#define	MLX5_IB_RP_AI_RATE_ATTR				BIT(7)
#define	MLX5_IB_RP_HAI_RATE_ATTR			BIT(8)
#define	MLX5_IB_RP_MIN_DEC_FAC_ATTR			BIT(9)
#define	MLX5_IB_RP_MIN_RATE_ATTR			BIT(10)
#define	MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR	BIT(11)
#define	MLX5_IB_RP_DCE_TCP_G_ATTR			BIT(12)
#define	MLX5_IB_RP_DCE_TCP_RTT_ATTR			BIT(13)
#define	MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR	BIT(14)
#define	MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR		BIT(15)
#define	MLX5_IB_RP_GD_ATTR				BIT(16)

#define	MLX5_IB_NP_CNP_DSCP_ATTR			BIT(3)
#define	MLX5_IB_NP_CNP_PRIO_MODE_ATTR			BIT(4)

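/*
 * Congestion node selectors passed in the cong_protocol field of the
 * QUERY/MODIFY_CONG_PARAMS commands (RP = reaction point,
 * NP = notification point).
 */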
enum mlx5_ib_cong_node_type {
	MLX5_IB_RROCE_ECN_RP = 1,
	MLX5_IB_RROCE_ECN_NP = 2,
};

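/*
 * Map a parameter index to the congestion node type it belongs to.
 * Indices from rp_clamp_tgt_rate up to and including rp_gd are RP
 * parameters; everything else is treated as an NP parameter.
 */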
static enum mlx5_ib_cong_node_type
mlx5_ib_param_to_node(u32 index)
{

	if (index >= MLX5_IB_INDEX(rp_clamp_tgt_rate) &&
	    index <= MLX5_IB_INDEX(rp_gd))
		return MLX5_IB_RROCE_ECN_RP;
	else
		return MLX5_IB_RROCE_ECN_NP;
}

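/*
 * Extract one congestion control parameter from the parameter block
 * returned by QUERY_CONG_PARAMS.  Unknown indices read back as zero.
 */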
static u64
mlx5_get_cc_param_val(void *field, u32 index)
{

	switch (index) {
	case MLX5_IB_INDEX(rp_clamp_tgt_rate):
		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
				clamp_tgt_rate);
	case MLX5_IB_INDEX(rp_clamp_tgt_rate_ati):
		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
				clamp_tgt_rate_after_time_inc);
	case MLX5_IB_INDEX(rp_time_reset):
		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
				rpg_time_reset);
	case MLX5_IB_INDEX(rp_byte_reset):
		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
				rpg_byte_reset);
	case MLX5_IB_INDEX(rp_threshold):
		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
				rpg_threshold);
	case MLX5_IB_INDEX(rp_ai_rate):
		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
				rpg_ai_rate);
	case MLX5_IB_INDEX(rp_hai_rate):
		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
				rpg_hai_rate);
	case MLX5_IB_INDEX(rp_min_dec_fac):
		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
				rpg_min_dec_fac);
	case MLX5_IB_INDEX(rp_min_rate):
		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
				rpg_min_rate);
	case MLX5_IB_INDEX(rp_rate_to_set_on_first_cnp):
		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
				rate_to_set_on_first_cnp);
	case MLX5_IB_INDEX(rp_dce_tcp_g):
		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
				dce_tcp_g);
	case MLX5_IB_INDEX(rp_dce_tcp_rtt):
		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
				dce_tcp_rtt);
	case MLX5_IB_INDEX(rp_rate_reduce_monitor_period):
		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
				rate_reduce_monitor_period);
	case MLX5_IB_INDEX(rp_initial_alpha_value):
		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
				initial_alpha_value);
	case MLX5_IB_INDEX(rp_gd):
		return MLX5_GET(cong_control_r_roce_ecn_rp, field,
				rpg_gd);
	case MLX5_IB_INDEX(np_cnp_dscp):
		return MLX5_GET(cong_control_r_roce_ecn_np, field,
				cnp_dscp);
	case MLX5_IB_INDEX(np_cnp_prio_mode):
		return MLX5_GET(cong_control_r_roce_ecn_np, field,
				cnp_prio_mode);
	case MLX5_IB_INDEX(np_cnp_prio):
		return MLX5_GET(cong_control_r_roce_ecn_np, field,
				cnp_802p_prio);
	default:
		return 0;
	}
}

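/*
 * Store one congestion control parameter into the parameter block of a
 * MODIFY_CONG_PARAMS command and flag the matching field_select bit in
 * *attr_mask.  Writing np_cnp_prio also clears cnp_prio_mode before
 * setting cnp_802p_prio.
 */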
static void
mlx5_ib_set_cc_param_mask_val(void *field, u32 index,
    u64 var, u32 *attr_mask)
{

	switch (index) {
	case MLX5_IB_INDEX(rp_clamp_tgt_rate):
		*attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
			 clamp_tgt_rate, var);
		break;
	case MLX5_IB_INDEX(rp_clamp_tgt_rate_ati):
		*attr_mask |= MLX5_IB_RP_CLAMP_TGT_RATE_ATI_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
			 clamp_tgt_rate_after_time_inc, var);
		break;
	case MLX5_IB_INDEX(rp_time_reset):
		*attr_mask |= MLX5_IB_RP_TIME_RESET_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
			 rpg_time_reset, var);
		break;
	case MLX5_IB_INDEX(rp_byte_reset):
		*attr_mask |= MLX5_IB_RP_BYTE_RESET_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
			 rpg_byte_reset, var);
		break;
	case MLX5_IB_INDEX(rp_threshold):
		*attr_mask |= MLX5_IB_RP_THRESHOLD_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
			 rpg_threshold, var);
		break;
	case MLX5_IB_INDEX(rp_ai_rate):
		*attr_mask |= MLX5_IB_RP_AI_RATE_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
			 rpg_ai_rate, var);
		break;
	case MLX5_IB_INDEX(rp_hai_rate):
		*attr_mask |= MLX5_IB_RP_HAI_RATE_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
			 rpg_hai_rate, var);
		break;
	case MLX5_IB_INDEX(rp_min_dec_fac):
		*attr_mask |= MLX5_IB_RP_MIN_DEC_FAC_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
			 rpg_min_dec_fac, var);
		break;
	case MLX5_IB_INDEX(rp_min_rate):
		*attr_mask |= MLX5_IB_RP_MIN_RATE_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
			 rpg_min_rate, var);
		break;
	case MLX5_IB_INDEX(rp_rate_to_set_on_first_cnp):
		*attr_mask |= MLX5_IB_RP_RATE_TO_SET_ON_FIRST_CNP_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
			 rate_to_set_on_first_cnp, var);
		break;
	case MLX5_IB_INDEX(rp_dce_tcp_g):
		*attr_mask |= MLX5_IB_RP_DCE_TCP_G_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
			 dce_tcp_g, var);
		break;
	case MLX5_IB_INDEX(rp_dce_tcp_rtt):
		*attr_mask |= MLX5_IB_RP_DCE_TCP_RTT_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
			 dce_tcp_rtt, var);
		break;
	case MLX5_IB_INDEX(rp_rate_reduce_monitor_period):
		*attr_mask |= MLX5_IB_RP_RATE_REDUCE_MONITOR_PERIOD_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
			 rate_reduce_monitor_period, var);
		break;
	case MLX5_IB_INDEX(rp_initial_alpha_value):
		*attr_mask |= MLX5_IB_RP_INITIAL_ALPHA_VALUE_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
			 initial_alpha_value, var);
		break;
	case MLX5_IB_INDEX(rp_gd):
		*attr_mask |= MLX5_IB_RP_GD_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_rp, field,
			 rpg_gd, var);
		break;
	case MLX5_IB_INDEX(np_cnp_dscp):
		*attr_mask |= MLX5_IB_NP_CNP_DSCP_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_dscp, var);
		break;
	case MLX5_IB_INDEX(np_cnp_prio_mode):
		*attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR;
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_prio_mode, var);
		break;
	case MLX5_IB_INDEX(np_cnp_prio):
		*attr_mask |= MLX5_IB_NP_CNP_PRIO_MODE_ATTR;
		MLX5_SET(cong_control_r_roce_ecn_np, field, cnp_prio_mode, 0);
		MLX5_IB_SET_CLIPPED(cong_control_r_roce_ecn_np, field, cnp_802p_prio, var);
		break;
	default:
		break;
	}
}

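/*
 * Query the current RP and NP parameter blocks from firmware and cache
 * every parameter in dev->congestion.arg[].  The query is only
 * reissued when the iteration crosses from RP into NP parameters.
 */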
static int
mlx5_ib_get_all_cc_params(struct mlx5_ib_dev *dev)
{
	int outlen = MLX5_ST_SZ_BYTES(query_cong_params_out);
	enum mlx5_ib_cong_node_type node = 0;
	void *out;
	void *field;
	u32 x;
	int err = 0;

	out = kzalloc(outlen, GFP_KERNEL);
	if (!out)
		return -ENOMEM;

	/* get the current values */
	for (x = 0; x != MLX5_IB_CONG_PARAMS_NUM; x++) {
		if (node != mlx5_ib_param_to_node(x)) {
			node = mlx5_ib_param_to_node(x);

			err = mlx5_cmd_query_cong_params(dev->mdev, node, out, outlen);
			if (err)
				break;
		}
		field = MLX5_ADDR_OF(query_cong_params_out, out, congestion_parameters);
		dev->congestion.arg[x] = mlx5_get_cc_param_val(field, x);
	}
	kfree(out);
	return err;
}

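/*
 * Push a single congestion control parameter to firmware via the
 * MODIFY_CONG_PARAMS command, selecting the proper congestion node and
 * field_select bit for the given index.
 */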
static int
mlx5_ib_set_cc_params(struct mlx5_ib_dev *dev, u32 index, u64 var)
{
	int inlen = MLX5_ST_SZ_BYTES(modify_cong_params_in);
	enum mlx5_ib_cong_node_type node;
	u32 attr_mask = 0;
	void *field;
	void *in;
	int err;

	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	MLX5_SET(modify_cong_params_in, in, opcode,
		 MLX5_CMD_OP_MODIFY_CONG_PARAMS);

	node = mlx5_ib_param_to_node(index);
	MLX5_SET(modify_cong_params_in, in, cong_protocol, node);

	field = MLX5_ADDR_OF(modify_cong_params_in, in, congestion_parameters);
	mlx5_ib_set_cc_param_mask_val(field, index, var, &attr_mask);

	field = MLX5_ADDR_OF(modify_cong_params_in, in, field_select);
	MLX5_SET(field_select_r_roce_rp, field, field_select_r_roce_rp,
		 attr_mask);

	err = mlx5_cmd_modify_cong_params(dev->mdev, in, inlen);
	kfree(in);

	return err;
}

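/*
 * Sysctl handler for the "conf" parameters.  Reads return the cached
 * value; writes update the cache and are pushed to firmware, provided
 * the device reports the cc_modify_allowed capability.
 */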
static int
mlx5_ib_cong_params_handler(SYSCTL_HANDLER_ARGS)
{
	struct mlx5_ib_dev *dev = arg1;
	u64 value;
	int error;

	CONG_LOCK(dev);
	value = dev->congestion.arg[arg2];
	if (req != NULL) {
		error = sysctl_handle_64(oidp, &value, 0, req);
		if (error || req->newptr == NULL ||
		    value == dev->congestion.arg[arg2])
			goto done;

		/* assign new value */
		dev->congestion.arg[arg2] = value;
	} else {
		error = 0;
	}
	if (!MLX5_CAP_GEN(dev->mdev, cc_modify_allowed))
		error = EPERM;
	else {
		error = -mlx5_ib_set_cc_params(dev, MLX5_IB_INDEX(arg[arg2]),
		    dev->congestion.arg[arg2]);
	}
done:
	CONG_UNLOCK(dev);

	return (error);
}

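/*
 * Combine the _high and _low 32-bit halves of a 64-bit counter from
 * the QUERY_CONG_STATISTICS output into a single u64.
 */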
#define	MLX5_GET_UNALIGNED_64(t,p,f) \
    (((u64)MLX5_GET(t,p,f##_high) << 32) | MLX5_GET(t,p,f##_low))

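/*
 * Delayed work which polls the congestion statistics from firmware and
 * caches them in struct mlx5_ib_congestion for the read-only "stats"
 * sysctls.  The work reschedules itself every hz ticks (about once a
 * second), even when the command or the allocation fails.
 */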
static void
mlx5_ib_read_cong_stats(struct work_struct *work)
{
	struct mlx5_ib_dev *dev =
	    container_of(work, struct mlx5_ib_dev, congestion.dwork.work);
	const int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
	void *out;

	out = kzalloc(outlen, GFP_KERNEL);
	if (!out)
		goto done;

	CONG_LOCK(dev);
	if (mlx5_cmd_query_cong_counter(dev->mdev, 0, out, outlen))
		memset(out, 0, outlen);

	dev->congestion.syndrome =
	    MLX5_GET(query_cong_statistics_out, out, syndrome);
	dev->congestion.rp_cur_flows =
	    MLX5_GET(query_cong_statistics_out, out, rp_cur_flows);
	dev->congestion.sum_flows =
	    MLX5_GET(query_cong_statistics_out, out, sum_flows);
	dev->congestion.rp_cnp_ignored =
	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, rp_cnp_ignored);
	dev->congestion.rp_cnp_handled =
	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, rp_cnp_handled);
	dev->congestion.time_stamp =
	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, time_stamp);
	dev->congestion.accumulators_period =
	    MLX5_GET(query_cong_statistics_out, out, accumulators_period);
	dev->congestion.np_ecn_marked_roce_packets =
	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, np_ecn_marked_roce_packets);
	dev->congestion.np_cnp_sent =
	    MLX5_GET_UNALIGNED_64(query_cong_statistics_out, out, np_cnp_sent);

	CONG_UNLOCK(dev);
	kfree(out);

done:
	schedule_delayed_work(&dev->congestion.dwork, hz);
}

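/*
 * Tear down congestion control support: stop the statistics work, free
 * the sysctl tree and destroy the lock.
 */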
void
mlx5_ib_cleanup_congestion(struct mlx5_ib_dev *dev)
{

	while (cancel_delayed_work_sync(&dev->congestion.dwork))
		;
	sysctl_ctx_free(&dev->congestion.ctx);
	sx_destroy(&dev->congestion.lock);
}

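/*
 * Create the congestion control sysctl tree under the IB device node,
 * with a writable "conf" subtree for the parameters and a read-only
 * "stats" subtree for the counters, and start the statistics polling.
 * Nothing is registered when the cc_query_allowed capability is
 * absent.  The tree is typically reachable as, for example,
 * sys.class.infiniband.<ibdev>.cong.conf.rp_dce_tcp_g (path shown for
 * illustration only).
 */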
int
mlx5_ib_init_congestion(struct mlx5_ib_dev *dev)
{
	struct sysctl_ctx_list *ctx;
	struct sysctl_oid *parent;
	struct sysctl_oid *node;
	int err;
	u32 x;

	ctx = &dev->congestion.ctx;
	sysctl_ctx_init(ctx);
	sx_init(&dev->congestion.lock, "mlx5ibcong");
	INIT_DELAYED_WORK(&dev->congestion.dwork, mlx5_ib_read_cong_stats);

	if (!MLX5_CAP_GEN(dev->mdev, cc_query_allowed))
		return (0);

	err = mlx5_ib_get_all_cc_params(dev);
	if (err)
		return (err);

	parent = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(dev->ib_dev.dev.kobj.oidp),
	    OID_AUTO, "cong", CTLFLAG_RW, NULL, "Congestion control");
	if (parent == NULL)
		return (-ENOMEM);

	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(parent),
	    OID_AUTO, "conf", CTLFLAG_RW, NULL, "Configuration");
	if (node == NULL) {
		sysctl_ctx_free(&dev->congestion.ctx);
		return (-ENOMEM);
	}

	for (x = 0; x != MLX5_IB_CONG_PARAMS_NUM; x++) {
		SYSCTL_ADD_PROC(ctx,
		    SYSCTL_CHILDREN(node), OID_AUTO,
		    mlx5_ib_cong_params_desc[2 * x],
		    CTLTYPE_U64 | CTLFLAG_RWTUN | CTLFLAG_MPSAFE,
		    dev, x, &mlx5_ib_cong_params_handler, "QU",
		    mlx5_ib_cong_params_desc[2 * x + 1]);
	}

	node = SYSCTL_ADD_NODE(ctx, SYSCTL_CHILDREN(parent),
	    OID_AUTO, "stats", CTLFLAG_RD, NULL, "Statistics");
	if (node == NULL) {
		sysctl_ctx_free(&dev->congestion.ctx);
		return (-ENOMEM);
	}

	for (x = 0; x != MLX5_IB_CONG_STATS_NUM; x++) {
		/* read-only SYSCTLs */
		SYSCTL_ADD_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    mlx5_ib_cong_stats_desc[2 * x],
		    CTLFLAG_RD | CTLFLAG_MPSAFE,
		    &dev->congestion.arg[x + MLX5_IB_CONG_PARAMS_NUM],
		    0, mlx5_ib_cong_stats_desc[2 * x + 1]);
	}
	schedule_delayed_work(&dev->congestion.dwork, hz);
	return (0);
}