/*-
 * Copyright (c) 2016-2020 Mellanox Technologies. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "opt_rss.h"
#include "opt_ratelimit.h"

#include <dev/mlx5/mlx5_en/en.h>

#ifdef RATELIMIT

static int mlx5e_rl_open_workers(struct mlx5e_priv *);
static void mlx5e_rl_close_workers(struct mlx5e_priv *);
static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
    struct sysctl_oid *, const char *name, const char *desc);
static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc);
static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
static if_snd_tag_modify_t mlx5e_rl_snd_tag_modify;
static if_snd_tag_query_t mlx5e_rl_snd_tag_query;
static if_snd_tag_free_t mlx5e_rl_snd_tag_free;

static const struct if_snd_tag_sw mlx5e_rl_snd_tag_sw = {
	.snd_tag_modify = mlx5e_rl_snd_tag_modify,
	.snd_tag_query = mlx5e_rl_snd_tag_query,
	.snd_tag_free = mlx5e_rl_snd_tag_free,
	.type = IF_SND_TAG_TYPE_RATE_LIMIT
};

static void
mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_sq_param *param)
{
	void *sqc = param->sqc;
	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);

	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
	MLX5_SET(wq, wq, pd, rl->priv->pdn);

	param->wq.linear = 1;
}

static void
mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_cq_param *param)
{
	void *cqc = param->cqc;
	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);

	MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
	MLX5_SET(cqc, cqc, uar_page, rl->priv->mdev->priv.uar->index);

	switch (rl->param.tx_coalesce_mode) {
	case 0:
		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
		break;
	default:
		if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
		else
			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
		break;
	}
}

static void
mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_rl_channel_param *cparam)
{
	memset(cparam, 0, sizeof(*cparam));

	mlx5e_rl_build_sq_param(rl, &cparam->sq);
	mlx5e_rl_build_cq_param(rl, &cparam->cq);
}

static int
mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
    struct mlx5e_sq_param *param, int ix)
{
	struct mlx5_core_dev *mdev = priv->mdev;
	void *sqc = param->sqc;
	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
	int err;

	/* Create DMA descriptor TAG */
	if ((err = -bus_dma_tag_create(
	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
	    1,				/* any alignment */
	    0,				/* no boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
	    0,				/* flags */
	    NULL, NULL,			/* lockfunc, lockfuncarg */
	    &sq->dma_tag)))
		goto done;

	sq->mkey_be = cpu_to_be32(priv->mr.key);
	sq->ifp = priv->ifp;
	sq->priv = priv;

	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
	    &sq->wq_ctrl);
	if (err)
		goto err_free_dma_tag;

	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];

	err = mlx5e_alloc_sq_db(sq);
	if (err)
		goto err_sq_wq_destroy;

	mlx5e_update_sq_inline(sq);

	return (0);

err_sq_wq_destroy:
	mlx5_wq_destroy(&sq->wq_ctrl);
err_free_dma_tag:
	bus_dma_tag_destroy(sq->dma_tag);
done:
	return (err);
}

static void
mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
{

	mlx5e_free_sq_db(sq);
	mlx5_wq_destroy(&sq->wq_ctrl);
	bus_dma_tag_destroy(sq->dma_tag);
}

static int
mlx5e_rl_query_sq(struct mlx5e_sq *sq)
{
	void *out;
	int inlen;
	int err;

	inlen = MLX5_ST_SZ_BYTES(query_sq_out);
	out = mlx5_vzalloc(inlen);
	if (!out)
		return (-ENOMEM);

	err = mlx5_core_query_sq(sq->priv->mdev, sq->sqn, out);
	if (err)
		goto out;

	sq->queue_handle = MLX5_GET(query_sq_out, out, sq_context.queue_handle);

out:
	kvfree(out);
	return (err);
}

static int
mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
    struct mlx5e_sq_param *param, int ix)
{
	int err;

	err = mlx5e_rl_create_sq(priv, sq, param, ix);
	if (err)
		return (err);

	err = mlx5e_enable_sq(sq, param, &priv->channel[ix].bfreg, priv->rl.tisn);
	if (err)
		goto err_destroy_sq;

	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
	if (err)
		goto err_disable_sq;

	if (MLX5_CAP_QOS(priv->mdev, qos_remap_pp)) {
		err = mlx5e_rl_query_sq(sq);
		if (err) {
			mlx5_en_err(priv->ifp, "Failed retrieving send queue handle for "
			    "SQ remap - sqn=%u, err=(%d)\n", sq->sqn, err);
			sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE;
		}
	} else
		sq->queue_handle = MLX5_INVALID_QUEUE_HANDLE;

	WRITE_ONCE(sq->running, 1);

	return (0);

err_disable_sq:
	mlx5e_disable_sq(sq);
err_destroy_sq:
	mlx5e_rl_destroy_sq(sq);

	return (err);
}

static void
mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
{
	mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
	mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);

	callout_init_mtx(&sq->cev_callout, &sq->lock, 0);

	sq->cev_factor = priv->rl.param.tx_completion_fact;

	/* ensure the TX completion event factor is not zero */
	if (sq->cev_factor == 0)
		sq->cev_factor = 1;
}

static int
mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
    struct mlx5e_rl_channel_param *cparam,
    struct mlx5e_sq *volatile *ppsq)
{
	struct mlx5e_priv *priv = rlw->priv;
	struct mlx5e_sq *sq;
	int err;

	sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);

	/* init mutexes */
	mlx5e_rl_chan_mtx_init(priv, sq);

	/* open TX completion queue */
	err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
	    &mlx5e_tx_cq_comp, eq_ix);
	if (err)
		goto err_free;

	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
	if (err)
		goto err_close_tx_cq;

	/* store TX channel pointer */
	*ppsq = sq;

	/* poll TX queue initially */
	sq->cq.mcq.comp(&sq->cq.mcq, NULL);

	return (0);

err_close_tx_cq:
	mlx5e_close_cq(&sq->cq);

err_free:
	/* destroy mutexes */
	mtx_destroy(&sq->lock);
	mtx_destroy(&sq->comp_lock);
	free(sq, M_MLX5EN);
	atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
	return (err);
}

static void
mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
{
	struct mlx5e_sq *sq = *ppsq;

	/* check if channel is already closed */
	if (sq == NULL)
		return;
	/* ensure channel pointer is no longer used */
	*ppsq = NULL;

	/* teardown and destroy SQ */
	mlx5e_drain_sq(sq);
	mlx5e_disable_sq(sq);
	mlx5e_rl_destroy_sq(sq);

	/* close CQ */
	mlx5e_close_cq(&sq->cq);

	/* destroy mutexes */
	mtx_destroy(&sq->lock);
	mtx_destroy(&sq->comp_lock);

	free(sq, M_MLX5EN);
}

static void
mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
{
	/*
	 * Limit the maximum distance between completion events to
	 * half of the currently set TX queue size.
	 *
	 * The maximum number of queue entries a single IP packet can
	 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
	 *
	 * The worst case max value is then given as below:
	 */
	uint64_t max = rl->param.tx_queue_size /
	    (2 * MLX5_SEND_WQE_MAX_WQEBBS);
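
	/*
	 * Worked example (values are illustrative, not a statement of
	 * the macro's actual value): with tx_queue_size = 1024 and
	 * MLX5_SEND_WQE_MAX_WQEBBS = 16, max = 1024 / (2 * 16) = 32,
	 * i.e. at most every 32nd send WQE requests a completion,
	 * which in the worst case of maximum-sized WQEs keeps
	 * completion events within half the queue.
	 */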

	/*
	 * Update the maximum completion factor value in case the
	 * tx_queue_size field changed. Ensure we don't overflow
	 * 16-bits.
	 */
	if (max < 1)
		max = 1;
	else if (max > 65535)
		max = 65535;
	rl->param.tx_completion_fact_max = max;

	/*
	 * Verify that the current TX completion factor is within the
	 * given limits:
	 */
	if (rl->param.tx_completion_fact < 1)
		rl->param.tx_completion_fact = 1;
	else if (rl->param.tx_completion_fact > max)
		rl->param.tx_completion_fact = max;
}

static int
mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
{
	struct mlx5e_priv *priv = sq->priv;
	struct mlx5_core_dev *mdev = priv->mdev;

	void *in;
	void *sqc;
	int inlen;
	int err;

	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
	in = mlx5_vzalloc(inlen);
	if (in == NULL)
		return (-ENOMEM);

	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);

	MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
	MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY);
	MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
	MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);

	err = mlx5_core_modify_sq(mdev, in, inlen);

	kvfree(in);

	return (err);
}

/*
 * This function will search the configured rate limit table for the
 * best match, to avoid that a single socket based application can
 * allocate all the available hardware rates. If the user selected
 * rate deviates too much from the closest rate available in the rate
 * limit table, the unlimited rate will be selected.
 */
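/*
 * Worked example (illustrative): with rate table entries of
 * 100 Mbit/s and 1 Gbit/s and a user_rate of 900 Mbit/s, the closest
 * entry is 1 Gbit/s with a distance of 100 Mbit/s. With the default
 * tx_allowed_deviation of 50 (5.0%), the allowed deviation is
 * howmany(900000000 * 50, 1000) = 45 Mbit/s, so the distance is too
 * large and the unlimited rate (0) is returned instead.
 */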
static uint64_t
mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate)
{
	uint64_t distance = -1ULL;
	uint64_t diff;
	uint64_t retval = 0;		/* unlimited */
	uint64_t x;

	/* search for closest rate */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		uint64_t rate = rl->rate_limit_table[x];
		if (rate == 0)
			continue;

		if (rate > user_rate)
			diff = rate - user_rate;
		else
			diff = user_rate - rate;

		/* check if this rate is closer than the previous best match */
		if (diff < distance) {
			distance = diff;
			retval = rate;
		}
	}

	/* range check for multiplication below */
	if (user_rate > rl->param.tx_limit_max)
		user_rate = rl->param.tx_limit_max;

	/* fallback to unlimited, if rate deviates too much */
	if (distance > howmany(user_rate *
	    rl->param.tx_allowed_deviation, 1000ULL))
		retval = 0;

	return (retval);
}

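/*
 * Post a QOS remap work request on the given internal queue (IQ).
 * This re-attaches the send queue identified by "sq_handle" to the
 * scheduling context identified by "scq_handle" without destroying
 * and re-creating the SQ. The owning rate limit channel's reference
 * count is held until the WQE completes. This summary is inferred
 * from the code below; the exact firmware semantics are
 * device-specific.
 */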
static int
mlx5e_rl_post_sq_remap_wqe(struct mlx5e_iq *iq, u32 scq_handle, u32 sq_handle,
    struct mlx5e_rl_channel *sq_channel)
{
	const u32 ds_cnt = DIV_ROUND_UP(sizeof(struct mlx5e_tx_qos_remap_wqe),
	    MLX5_SEND_WQE_DS);
	struct mlx5e_tx_qos_remap_wqe *wqe;
	int pi;

	mtx_lock(&iq->lock);
	pi = mlx5e_iq_get_producer_index(iq);
	if (pi < 0) {
		mtx_unlock(&iq->lock);
		return (-ENOMEM);
	}
	wqe = mlx5_wq_cyc_get_wqe(&iq->wq, pi);

	memset(wqe, 0, sizeof(*wqe));

	wqe->qos_remap.qos_handle = cpu_to_be32(scq_handle);
	wqe->qos_remap.queue_handle = cpu_to_be32(sq_handle);

	wqe->ctrl.opmod_idx_opcode = cpu_to_be32((iq->pc << 8) |
	    MLX5_OPCODE_QOS_REMAP);
	wqe->ctrl.qpn_ds = cpu_to_be32((iq->sqn << 8) | ds_cnt);
	wqe->ctrl.imm = cpu_to_be32(iq->priv->tisn[0] << 8);
	wqe->ctrl.fm_ce_se = MLX5_WQE_CTRL_CQ_UPDATE | MLX5_FENCE_MODE_INITIATOR_SMALL;

	/* copy data for doorbell */
	memcpy(iq->doorbell.d32, &wqe->ctrl, sizeof(iq->doorbell.d32));

	iq->data[pi].num_wqebbs = DIV_ROUND_UP(ds_cnt, MLX5_SEND_WQEBB_NUM_DS);
	iq->data[pi].p_refcount = &sq_channel->refcount;
	atomic_add_int(iq->data[pi].p_refcount, 1);
	iq->pc += iq->data[pi].num_wqebbs;

	mlx5e_iq_notify_hw(iq);

	mtx_unlock(&iq->lock);

	return (0); /* success */
}

static int
mlx5e_rl_remap_sq(struct mlx5e_sq *sq, uint16_t index,
    struct mlx5e_rl_channel *sq_channel)
{
	struct mlx5e_channel *iq_channel;
	u32 scq_handle;
	u32 sq_handle;
	int error;

	/* Specific SQ remap operations should be handled by the same IQ */
	iq_channel = &sq->priv->channel[sq->sqn % sq->priv->params.num_channels];

	sq_handle = sq->queue_handle;
	scq_handle = mlx5_rl_get_scq_handle(sq->priv->mdev, index);

	if (sq_handle == MLX5_INVALID_QUEUE_HANDLE ||
	    scq_handle == MLX5_INVALID_QUEUE_HANDLE)
		error = -1;
	else
		error = mlx5e_rl_post_sq_remap_wqe(&iq_channel->iq, scq_handle,
		    sq_handle, sq_channel);

	return (error);
}

/*
 * This function sets the requested rate for a rate limit channel, in
 * bits per second. The requested rate will be filtered through the
 * find best rate function above.
 */
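/*
 * Burst size example (illustrative): with the default tx_burst_size
 * of 4 MTUs and an interface MTU of 1500 bytes, the burst handed to
 * the firmware is 4 * MLX5E_SW2HW_MTU(1500) bytes, clamped to at
 * most 65535 bytes.
 */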
static int
mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel, uint64_t rate)
{
	struct mlx5e_rl_priv_data *rl = &rlw->priv->rl;
	struct mlx5e_sq *sq;
	uint64_t temp;
	uint16_t index;
	uint16_t burst;
	int error;
	bool use_sq_remap;

	if (rate != 0) {
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(rl);

		/* get current burst size in bytes */
		temp = rl->param.tx_burst_size *
		    MLX5E_SW2HW_MTU(if_getmtu(rlw->priv->ifp));

		/* limit burst size to 64K currently */
		if (temp > 65535)
			temp = 65535;
		burst = temp;

		/* find best rate */
		rate = mlx5e_rl_find_best_rate_locked(rl, rate);

		MLX5E_RL_RUNLOCK(rl);

		if (rate == 0) {
			/* rate doesn't exist, fallback to unlimited */
			index = 0;
			rate = 0;
			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
		} else {
			/* get a reference on the new rate */
			error = -mlx5_rl_add_rate(rlw->priv->mdev,
			    howmany(rate, 1000), burst, &index);

			if (error != 0) {
				/* adding rate failed, fallback to unlimited */
				index = 0;
				rate = 0;
				atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL);
			}
		}
		MLX5E_RL_WORKER_LOCK(rlw);
	} else {
		index = 0;
		burst = 0;	/* default */
	}

	/* paced <--> non-paced transitions must go via FW */
	use_sq_remap = MLX5_CAP_QOS(rlw->priv->mdev, qos_remap_pp) &&
	    channel->last_rate != 0 && rate != 0;

	/* atomically swap rates */
	temp = channel->last_rate;
	channel->last_rate = rate;
	rate = temp;

	/* atomically swap burst size */
	temp = channel->last_burst;
	channel->last_burst = burst;
	burst = temp;

	MLX5E_RL_WORKER_UNLOCK(rlw);
	/* put reference on the old rate, if any */
	if (rate != 0) {
		mlx5_rl_remove_rate(rlw->priv->mdev,
		    howmany(rate, 1000), burst);
	}

	/* set new rate, if SQ is running */
	sq = channel->sq;
	if (sq != NULL && READ_ONCE(sq->running) != 0) {
		if (!use_sq_remap || mlx5e_rl_remap_sq(sq, index, channel)) {
			while (atomic_load_int(&channel->refcount) != 0 &&
			    rlw->priv->mdev->state != MLX5_DEVICE_STATE_INTERNAL_ERROR &&
			    pci_channel_offline(rlw->priv->mdev->pdev) == 0)
				pause("W", 1);
			error = mlx5e_rl_modify_sq(sq, index);
			if (error != 0)
				atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
		}
	} else
		error = 0;

	MLX5E_RL_WORKER_LOCK(rlw);

	return (-error);
}

static void
mlx5e_rl_worker(void *arg)
{
	struct thread *td;
	struct mlx5e_rl_worker *rlw = arg;
	struct mlx5e_rl_channel *channel;
	struct mlx5e_priv *priv;
	unsigned ix;
	uint64_t x;
	int error;

	/* set thread priority */
	td = curthread;

	thread_lock(td);
	sched_prio(td, PI_SWI(SWI_NET));
	thread_unlock(td);

	priv = rlw->priv;

	/* compute completion vector */
	ix = (rlw - priv->rl.workers) %
	    priv->mdev->priv.eq_table.num_comp_vectors;

	/* TODO bind to CPU */

	/* open all the SQs */
	MLX5E_RL_WORKER_LOCK(rlw);
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		struct mlx5e_rl_channel *channel = rlw->channels + x;

#if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
		if (channel->state == MLX5E_RL_ST_FREE)
			continue;
#endif
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(&priv->rl);
		error = mlx5e_rl_open_channel(rlw, ix,
		    &priv->rl.chan_param, &channel->sq);
		MLX5E_RL_RUNLOCK(&priv->rl);

		MLX5E_RL_WORKER_LOCK(rlw);
		if (error != 0) {
			mlx5_en_err(priv->ifp,
			    "mlx5e_rl_open_channel failed: %d\n", error);
			break;
		}
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
	}
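
	/*
	 * Main processing loop, as implemented below: the worker
	 * sleeps on its condition variable until a channel is queued
	 * on "process_head" or teardown is requested. Channels in the
	 * MODIFY state get their SQ created or resumed on demand and
	 * the new rate applied; channels in the DESTROY state are
	 * drained and returned to the free list.
	 */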
	while (1) {
		if (STAILQ_FIRST(&rlw->process_head) == NULL) {
			/* check if we are tearing down */
			if (rlw->worker_done != 0)
				break;
			cv_wait(&rlw->cv, &rlw->mtx);
		}
		/* check if we are tearing down */
		if (rlw->worker_done != 0)
			break;
		channel = STAILQ_FIRST(&rlw->process_head);
		if (channel != NULL) {
			STAILQ_REMOVE_HEAD(&rlw->process_head, entry);

			switch (channel->state) {
			case MLX5E_RL_ST_MODIFY:
				channel->state = MLX5E_RL_ST_USED;
				MLX5E_RL_WORKER_UNLOCK(rlw);

				/* create channel on demand */
				if (channel->sq == NULL) {
					MLX5E_RL_RLOCK(&priv->rl);
					error = mlx5e_rl_open_channel(rlw, ix,
					    &priv->rl.chan_param, &channel->sq);
					MLX5E_RL_RUNLOCK(&priv->rl);

					if (error != 0) {
						mlx5_en_err(priv->ifp,
						    "mlx5e_rl_open_channel failed: %d\n", error);
					} else {
						atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
					}
				} else {
					mlx5e_resume_sq(channel->sq);
				}

				MLX5E_RL_WORKER_LOCK(rlw);
				/* convert from bytes/s to bits/s and set new rate */
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
				    channel->new_rate * 8ULL);
				if (error != 0) {
					mlx5_en_err(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				break;

			case MLX5E_RL_ST_DESTROY:
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
				if (error != 0) {
					mlx5_en_err(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				if (channel->sq != NULL) {
					/*
					 * Make sure all packets are
					 * transmitted before SQ is
					 * returned to free list:
					 */
					MLX5E_RL_WORKER_UNLOCK(rlw);
					mlx5e_drain_sq(channel->sq);
					MLX5E_RL_WORKER_LOCK(rlw);
				}
				/* put the channel back into the free list */
				STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
				channel->state = MLX5E_RL_ST_FREE;
				atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
				break;
			default:
				/* NOP */
				break;
			}
		}
	}

	/* close all the SQs */
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		struct mlx5e_rl_channel *channel = rlw->channels + x;

		/* update the initial rate */
		channel->init_rate = channel->last_rate;

		/* make sure we free up the rate resource */
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);

		if (channel->sq != NULL) {
			MLX5E_RL_WORKER_UNLOCK(rlw);
			mlx5e_rl_close_channel(&channel->sq);
			atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
			MLX5E_RL_WORKER_LOCK(rlw);
		}
	}

	rlw->worker_done = 0;
	cv_broadcast(&rlw->cv);
	MLX5E_RL_WORKER_UNLOCK(rlw);

	kthread_exit();
}

static int
mlx5e_rl_open_tis(struct mlx5e_priv *priv)
{
	struct mlx5_core_dev *mdev = priv->mdev;
	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);

	memset(in, 0, sizeof(in));

	MLX5_SET(tisc, tisc, prio, 0);
	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);

	return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn));
}

static void
mlx5e_rl_close_tis(struct mlx5e_priv *priv)
{
	mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn, 0);
}

static void
mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
    struct mlx5_core_dev *mdev)
{
	/* ratelimit workers */
	param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
	param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;

	/* range check */
	if (param->tx_worker_threads_def == 0 ||
	    param->tx_worker_threads_def > param->tx_worker_threads_max)
		param->tx_worker_threads_def = param->tx_worker_threads_max;

	/* ratelimit channels */
	param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
	    param->tx_worker_threads_def;
	param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;

	/* range check */
	if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
		param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;

	/* set default burst size */
	param->tx_burst_size = 4;	/* MTUs */

	/*
	 * Set maximum burst size
	 *
	 * The burst size is multiplied by the MTU and clamped to the
	 * range 0 ... 65535 bytes, inclusively, before being fed to
	 * the firmware.
	 *
	 * NOTE: If the burst size or MTU is changed, only ratelimit
	 * connections made after the change will use the new burst
	 * size.
	 */
	param->tx_burst_size_max = 255;

	/* get firmware rate limits in 1000bit/s and convert them to bit/s */
	param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
	param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;

	/* ratelimit table size */
	param->tx_rates_max = mdev->priv.rl_table.max_size;

	/* range check */
	if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
		param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;

	/* set default number of rates */
	param->tx_rates_def = param->tx_rates_max;

	/* set maximum allowed rate deviation */
	if (param->tx_limit_max != 0) {
		/*
		 * Make sure the deviation multiplication doesn't
		 * overflow unsigned 64-bit:
		 */
		param->tx_allowed_deviation_max = -1ULL /
		    param->tx_limit_max;
	}
	/* set default rate deviation */
	param->tx_allowed_deviation = 50;	/* 5.0% */

	/* channel parameters */
	param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
	param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
	param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT;
	param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT;
	param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT;
}


static const char *mlx5e_rl_params_desc[] = {
	MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
};

static const char *mlx5e_rl_table_params_desc[] = {
	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
};

static const char *mlx5e_rl_stats_desc[] = {
	MLX5E_RL_STATS(MLX5E_STATS_DESC)
};

int
mlx5e_rl_init(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct sysctl_oid *node;
	struct sysctl_oid *stats;
	char buf[64];
	uint64_t i;
	uint64_t j;
	int error;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return (0);

	rl->priv = priv;

	sysctl_ctx_init(&rl->ctx);

	sx_init(&rl->rl_sxlock, "ratelimit-sxlock");

	/* open own TIS domain for ratelimit SQs */
	error = mlx5e_rl_open_tis(priv);
	if (error)
		goto done;

	/* setup default value for parameters */
	mlx5e_rl_set_default_params(&rl->param, priv->mdev);

	/* update the completion factor */
	mlx5e_rl_sync_tx_completion_fact(rl);

	/* create root node */
	node = SYSCTL_ADD_NODE(&rl->ctx,
	    SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
	    "rate_limit", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Rate limiting support");

	if (node != NULL) {
		/* create SYSCTLs */
		for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(arg[i]),
			    node, mlx5e_rl_params_desc[2 * i],
			    mlx5e_rl_params_desc[2 * i + 1]);
		}

		stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node),
		    OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
		    "Rate limiting statistics");
		if (stats != NULL) {
			/* create SYSCTLs */
			for (i = 0; i != MLX5E_RL_STATS_NUM; i++) {
				mlx5e_rl_sysctl_add_stats_u64_oid(rl, i,
				    stats, mlx5e_rl_stats_desc[2 * i],
				    mlx5e_rl_stats_desc[2 * i + 1]);
			}
		}
	}

	/* allocate workers array */
	rl->workers = malloc(sizeof(rl->workers[0]) *
	    rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);

	/* allocate rate limit array */
	rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
	    rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);

	if (node != NULL) {
		/* create more SYSCTLs */
		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
		    CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
		    "A", "Show table of all configured TX rates");

		/* try to fetch rate table from kernel environment */
		for (i = 0; i != rl->param.tx_rates_def; i++) {
			/* compute path for tunable */
			snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
			    device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
			if (TUNABLE_QUAD_FETCH(buf, &j))
				mlx5e_rl_tx_limit_add(rl, j);
		}
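
		/*
		 * Example (illustrative): setting the loader tunable
		 * "dev.mce.0.rate_limit.tx_rate_add_0=100000000" in
		 * /boot/loader.conf pre-loads the rate table of the
		 * first mce(4) device with a 100 Mbit/s entry.
		 */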

		/* setup rate table sysctls */
		for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(table_arg[i]),
			    node, mlx5e_rl_table_params_desc[2 * i],
			    mlx5e_rl_table_params_desc[2 * i + 1]);
		}
	}

	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		rlw->priv = priv;

		cv_init(&rlw->cv, "mlx5-worker-cv");
		mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
		STAILQ_INIT(&rlw->index_list_head);
		STAILQ_INIT(&rlw->process_head);

		rlw->channels = malloc(sizeof(rlw->channels[0]) *
		    rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);

		MLX5E_RL_WORKER_LOCK(rlw);
		for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
			struct mlx5e_rl_channel *channel = rlw->channels + i;
			channel->worker = rlw;
			STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	PRIV_LOCK(priv);
	error = mlx5e_rl_open_workers(priv);
	PRIV_UNLOCK(priv);

	if (error != 0) {
		mlx5_en_err(priv->ifp,
		    "mlx5e_rl_open_workers failed: %d\n", error);
	}

	return (0);

done:
	sysctl_ctx_free(&rl->ctx);
	sx_destroy(&rl->rl_sxlock);
	return (error);
}

static int
mlx5e_rl_open_workers(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct thread *rl_thread = NULL;
	struct proc *rl_proc = NULL;
	uint64_t j;
	int error;

	if (priv->gone || rl->opened)
		return (-EINVAL);

	MLX5E_RL_WLOCK(rl);
	/* compute channel parameters once */
	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
	MLX5E_RL_WUNLOCK(rl);

	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		/* start worker thread */
		error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
		    RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
		if (error != 0) {
			mlx5_en_err(rl->priv->ifp,
			    "kproc_kthread_add failed: %d\n", error);
			rlw->worker_done = 1;
		}
	}

	rl->opened = 1;

	return (0);
}

static void
mlx5e_rl_close_workers(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	uint64_t y;

	if (rl->opened == 0)
		return;

	/* tear down worker threads simultaneously */
	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		/* tear down worker before freeing SQs */
		MLX5E_RL_WORKER_LOCK(rlw);
		if (rlw->worker_done == 0) {
			rlw->worker_done = 1;
			cv_broadcast(&rlw->cv);
		} else {
			/* XXX thread not started */
			rlw->worker_done = 0;
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	/* wait for worker threads to exit */
	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		/* tear down worker before freeing SQs */
		MLX5E_RL_WORKER_LOCK(rlw);
		while (rlw->worker_done != 0)
			cv_wait(&rlw->cv, &rlw->mtx);
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	rl->opened = 0;
}

static void
mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
{
	unsigned x;

	MLX5E_RL_WLOCK(rl);
	for (x = 0; x != rl->param.tx_rates_def; x++)
		rl->rate_limit_table[x] = 0;
	MLX5E_RL_WUNLOCK(rl);
}

void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	uint64_t y;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return;

	/* TODO check if there is support for packet pacing */

	sysctl_ctx_free(&rl->ctx);

	PRIV_LOCK(priv);
	mlx5e_rl_close_workers(priv);
	PRIV_UNLOCK(priv);

	mlx5e_rl_reset_rates(rl);

	/* close TIS domain */
	mlx5e_rl_close_tis(priv);

	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		cv_destroy(&rlw->cv);
		mtx_destroy(&rlw->mtx);
		free(rlw->channels, M_MLX5EN);
	}
	free(rl->rate_limit_table, M_MLX5EN);
	free(rl->workers, M_MLX5EN);
	sx_destroy(&rl->rl_sxlock);
}

static void
mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel)
{
	STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
	cv_broadcast(&rlw->cv);
}

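/*
 * Free a rate limit channel. A channel in the MODIFY state is simply
 * marked for destruction and will be handled when the worker dequeues
 * it; a channel in the USED state must additionally be queued so the
 * worker gets a chance to process it.
 */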
static void
mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
{
	if (channel == NULL)
		return;

	MLX5E_RL_WORKER_LOCK(rlw);
	switch (channel->state) {
	case MLX5E_RL_ST_MODIFY:
		channel->state = MLX5E_RL_ST_DESTROY;
		break;
	case MLX5E_RL_ST_USED:
		channel->state = MLX5E_RL_ST_DESTROY;
		mlx5e_rlw_queue_channel_locked(rlw, channel);
		break;
	default:
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);
}

static int
mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
{

	MLX5E_RL_WORKER_LOCK(rlw);
	channel->new_rate = rate;
	switch (channel->state) {
	case MLX5E_RL_ST_USED:
		channel->state = MLX5E_RL_ST_MODIFY;
		mlx5e_rlw_queue_channel_locked(rlw, channel);
		break;
	default:
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	return (0);
}

static int
mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel,
    union if_snd_tag_query_params *params)
{
	int retval;

	MLX5E_RL_WORKER_LOCK(rlw);
	switch (channel->state) {
	case MLX5E_RL_ST_USED:
		params->rate_limit.max_rate = channel->last_rate;
		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
		retval = 0;
		break;
	case MLX5E_RL_ST_MODIFY:
		params->rate_limit.max_rate = channel->last_rate;
		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
		retval = EBUSY;
		break;
	default:
		retval = EINVAL;
		break;
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	return (retval);
}

static int
mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel **pchannel)
{
	struct mlx5e_rl_channel *channel;
	int retval = ENOMEM;

	MLX5E_RL_WORKER_LOCK(rlw);
	/* Check for available channel in free list */
	if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
		retval = 0;
		/* Remove head index from available list */
		STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
		channel->state = MLX5E_RL_ST_USED;
		atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
	} else {
		atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	*pchannel = channel;
#ifdef RATELIMIT_DEBUG
	mlx5_en_info(rlw->priv->ifp,
	    "Channel pointer for rate limit connection is %p\n", channel);
#endif
	return (retval);
}

int
mlx5e_rl_snd_tag_alloc(if_t ifp,
    union if_snd_tag_alloc_params *params,
    struct m_snd_tag **ppmt)
{
	struct mlx5e_rl_channel *channel;
	struct mlx5e_rl_worker *rlw;
	struct mlx5e_priv *priv;
	int error;

	priv = if_getsoftc(ifp);

	/* check if there is support for packet pacing or if device is going away */
	if (!MLX5_CAP_GEN(priv->mdev, qos) ||
	    !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
	    params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
		return (EOPNOTSUPP);

	/* compute worker thread this TCP connection belongs to */
	rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
	    priv->rl.param.tx_worker_threads_def);

	error = mlx5e_find_available_tx_ring_index(rlw, &channel);
	if (error != 0)
		goto done;

	error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
	if (error != 0) {
		mlx5e_rl_free(rlw, channel);
		goto done;
	}

	/* store pointer to mbuf tag */
	MPASS(channel->tag.refcount == 0);
	m_snd_tag_init(&channel->tag, ifp, &mlx5e_rl_snd_tag_sw);
	*ppmt = &channel->tag;
done:
	return (error);
}

static int
mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
}

static int
mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	return (mlx5e_rl_query(channel->worker, channel, params));
}

static void
mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
{
	struct mlx5e_rl_channel *channel =
	    container_of(pmt, struct mlx5e_rl_channel, tag);

	mlx5e_rl_free(channel->worker, channel);
}

static int
mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
{
	struct mlx5e_rl_priv_data *rl = arg1;
	struct mlx5e_priv *priv = rl->priv;
	struct sbuf sbuf;
	unsigned x;
	int error;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);

	PRIV_LOCK(priv);

	sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);

	sbuf_printf(&sbuf,
	    "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
	    "\t" "--------------------------------------------\n");

	MLX5E_RL_RLOCK(rl);
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] == 0)
			continue;

		sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
		    x, (unsigned)rl->param.tx_burst_size,
		    (long long)rl->rate_limit_table[x]);
	}
	MLX5E_RL_RUNLOCK(rl);

	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);

	PRIV_UNLOCK(priv);

	return (error);
}

static int
mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
{
	uint64_t x;
	uint64_t y;

	MLX5E_RL_WLOCK(rl);
	/* compute channel parameters once */
	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
	MLX5E_RL_WUNLOCK(rl);

	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
			struct mlx5e_rl_channel *channel;
			struct mlx5e_sq *sq;

			channel = rlw->channels + x;
			sq = channel->sq;

			if (sq == NULL)
				continue;

			if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
				mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
				    rl->param.tx_coalesce_usecs,
				    rl->param.tx_coalesce_pkts,
				    rl->param.tx_coalesce_mode);
			} else {
				mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
				    rl->param.tx_coalesce_usecs,
				    rl->param.tx_coalesce_pkts);
			}
		}
	}
	return (0);
}

void
mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl)
{
	uint64_t x;
	uint64_t y;

	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
			struct mlx5e_rl_channel *channel;
			struct mlx5e_sq *sq;

			channel = rlw->channels + x;
			sq = channel->sq;

			if (sq == NULL)
				continue;

			mtx_lock(&sq->lock);
			mlx5e_update_sq_inline(sq);
			mtx_unlock(&sq->lock);
		}
	}
}

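/*
 * Note on units: rates in the rate limit table below are kept in
 * bit/s, while the firmware rate table operates in units of
 * 1000 bit/s. That is why the add/clear functions convert with
 * howmany(value, 1000) when range checking and reject values below
 * 1000 bit/s.
 */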
static int
mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
{
	unsigned x;
	int error;

	if (value < 1000 ||
	    mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
		return (EINVAL);

	MLX5E_RL_WLOCK(rl);
	error = ENOMEM;

	/* check if rate already exists */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] != value)
			continue;
		error = EEXIST;
		break;
	}

	/* check if there is a free rate entry */
	if (x == rl->param.tx_rates_def) {
		for (x = 0; x != rl->param.tx_rates_def; x++) {
			if (rl->rate_limit_table[x] != 0)
				continue;
			rl->rate_limit_table[x] = value;
			error = 0;
			break;
		}
	}
	MLX5E_RL_WUNLOCK(rl);

	return (error);
}

static int
mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
{
	unsigned x;
	int error;

	if (value == 0)
		return (EINVAL);

	MLX5E_RL_WLOCK(rl);

	/* search for the given rate */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] != value)
			continue;
		/* free up rate */
		rl->rate_limit_table[x] = 0;
		break;
	}

	/* check if the rate was found */
	if (x == rl->param.tx_rates_def)
		error = ENOENT;
	else
		error = 0;
	MLX5E_RL_WUNLOCK(rl);

	return (error);
}

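/*
 * Common handler for all 64-bit rate limit parameters. The "arg2"
 * sysctl argument is the index of the parameter inside the
 * rl->param.arg[] array, as established by MLX5E_RL_PARAMS_INDEX().
 */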
static int
mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct mlx5e_rl_priv_data *rl = arg1;
	struct mlx5e_priv *priv = rl->priv;
	unsigned mode_modify;
	unsigned was_opened;
	uint64_t value;
	int error;

	PRIV_LOCK(priv);

	MLX5E_RL_RLOCK(rl);
	value = rl->param.arg[arg2];
	MLX5E_RL_RUNLOCK(rl);

	if (req != NULL) {
		error = sysctl_handle_64(oidp, &value, 0, req);
		if (error || req->newptr == NULL ||
		    value == rl->param.arg[arg2])
			goto done;
	} else {
		error = 0;
	}

	/* check if device is gone */
	if (priv->gone) {
		error = ENXIO;
		goto done;
	}
	was_opened = rl->opened;
	mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);

	switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
	case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
		if (value > rl->param.tx_worker_threads_max)
			value = rl->param.tx_worker_threads_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
		if (value > rl->param.tx_channels_per_worker_max)
			value = rl->param.tx_channels_per_worker_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
		if (value > rl->param.tx_rates_max)
			value = rl->param.tx_rates_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
		/* range check */
		if (value < 1)
			value = 0;
		else if (value > MLX5E_FLD_MAX(cqc, cq_period))
			value = MLX5E_FLD_MAX(cqc, cq_period);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* check to avoid down and up the network interface */
		if (was_opened)
			error = mlx5e_rl_refresh_channel_params(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
		/* import TX coal pkts */
		if (value < 1)
			value = 0;
		else if (value > MLX5E_FLD_MAX(cqc, cq_max_count))
			value = MLX5E_FLD_MAX(cqc, cq_max_count);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* check to avoid down and up the network interface */
		if (was_opened)
			error = mlx5e_rl_refresh_channel_params(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
		/* network interface must be down */
		if (was_opened != 0 && mode_modify == 0)
			mlx5e_rl_close_workers(priv);

		/* import TX coalesce mode */
		if (value != 0)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;

		/* restart network interface, if any */
		if (was_opened != 0) {
			if (mode_modify == 0)
				mlx5e_rl_open_workers(priv);
			else
				error = mlx5e_rl_refresh_channel_params(rl);
		}
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
		/* network interface must be down */
		if (was_opened)
			mlx5e_rl_close_workers(priv);

		/* import TX queue size */
		if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
			value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
		else if (value > priv->params_ethtool.tx_queue_size_max)
			value = priv->params_ethtool.tx_queue_size_max;

		/* store actual TX queue size */
		value = 1ULL << order_base_2(value);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* verify TX completion factor */
		mlx5e_rl_sync_tx_completion_fact(rl);

		/* restart network interface, if any */
		if (was_opened)
			mlx5e_rl_open_workers(priv);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
		/* network interface must be down */
		if (was_opened)
			mlx5e_rl_close_workers(priv);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* verify parameter */
		mlx5e_rl_sync_tx_completion_fact(rl);

		/* restart network interface, if any */
		if (was_opened)
			mlx5e_rl_open_workers(priv);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
		error = mlx5e_rl_tx_limit_add(rl, value);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
		error = mlx5e_rl_tx_limit_clr(rl, value);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
		/* range check */
		if (value > rl->param.tx_allowed_deviation_max)
			value = rl->param.tx_allowed_deviation_max;
		else if (value < rl->param.tx_allowed_deviation_min)
			value = rl->param.tx_allowed_deviation_min;

		MLX5E_RL_WLOCK(rl);
		rl->param.arg[arg2] = value;
		MLX5E_RL_WUNLOCK(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
		/* range check */
		if (value > rl->param.tx_burst_size_max)
			value = rl->param.tx_burst_size_max;
		else if (value < rl->param.tx_burst_size_min)
			value = rl->param.tx_burst_size_min;

		MLX5E_RL_WLOCK(rl);
		rl->param.arg[arg2] = value;
		MLX5E_RL_WUNLOCK(rl);
		break;

	default:
		break;
	}
done:
	PRIV_UNLOCK(priv);
	return (error);
}

static void
mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc)
{
	/*
	 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
	 * take care of loading default sysctl value from the kernel
	 * environment, if any:
	 */
	if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
		/* read-only SYSCTLs */
		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    name, CTLTYPE_U64 | CTLFLAG_RD |
		    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
	} else {
		if (strstr(name, "_def") != 0) {
#ifdef RATELIMIT_DEBUG
			/* tunable read-only advanced SYSCTLs */
			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
			    name, CTLTYPE_U64 | CTLFLAG_RDTUN |
			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
#endif
		} else {
			/* read-write SYSCTLs */
			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
			    name, CTLTYPE_U64 | CTLFLAG_RWTUN |
			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
		}
	}
}

static void
mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc)
{
	/* read-only SYSCTLs */
	SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name,
	    CTLFLAG_RD, &rl->stats.arg[x], 0, desc);
}

#else

int
mlx5e_rl_init(struct mlx5e_priv *priv)
{

	return (0);
}

void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	/* NOP */
}

#endif		/* RATELIMIT */
