/*
 * net/sched/sch_tbf.c	Token Bucket Filter queue.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *		Dmitry Torokhov <dtor@mail.ru> - allow attaching inner qdiscs -
 *						 original idea by Martin Devera
 *
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/pkt_sched.h>


/*	Simple Token Bucket Filter.
	=======================================

	SOURCE.
	-------

	None.

	Description.
	------------

	A data flow obeys TBF with rate R and depth B, if for any
	time interval t_i...t_f the number of transmitted bits
	does not exceed B + R*(t_f-t_i).

	Packetized version of this definition:
	The sequence of packets of sizes s_i served at moments t_i
	obeys TBF, if for any i<=k:

	s_i+....+s_k <= B + R*(t_k - t_i)

	Algorithm.
	----------

	Let N(t_i) be B/R initially and N(t) grow continuously with time as:

	N(t+delta) = min{B/R, N(t) + delta}

	If the first packet in queue has length S, it may be
	transmitted only at the time t_* when S/R <= N(t_*),
	and in this case N(t) jumps:

	N(t_* + 0) = N(t_* - 0) - S/R.
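
	For illustration only (the numbers are not taken from this file):
	with R = 125000 bytes/sec and B = 12500 bytes, N is capped at
	B/R = 0.1 sec of credit.  A 2500 byte packet costs S/R = 0.02 sec;
	if N(t) = 0.05 sec when it reaches the head of the queue, it is
	sent at once and N drops to 0.03 sec.  Had N(t) been 0.01 sec,
	the packet would have to wait 0.01 sec for enough credit to
	accumulate.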



	Actually, QoS requires two TBFs to be applied to a data stream.
	One of them controls the steady-state burst size, while the other,
	with rate P (peak rate) and depth M (equal to link MTU),
	limits bursts at a smaller time scale.

	It is easy to see that P>R, and B>M. If P is infinity, this double
	TBF is equivalent to a single one.

	When TBF works in reshaping mode, latency is estimated as:

	lat = max ((L-B)/R, (L-M)/P)
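
	As a purely illustrative example: for a burst of L = 100000 bytes
	with B = 10000 bytes, R = 1000000 bytes/sec, M = 1500 bytes and
	P = 10000000 bytes/sec, (L-B)/R = 0.09 sec while (L-M)/P is only
	about 0.00985 sec, so the estimated latency is roughly 90 msec and
	is dominated by the steady-state rate R.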


	NOTES.
	------

	If TBF throttles, it starts a watchdog timer, which will wake it up
	when it is ready to transmit.
	Note that the minimal timer resolution is 1/HZ.
	If no new packets arrive during this period,
	or if the device is not awakened by EOI for some previous packet,
	TBF can stop its activity for 1/HZ.


	This means that, with depth B, the maximal rate is

	R_crit = B*HZ

	E.g. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes.

	Note that the peak rate TBF is much tougher: with MTU 1500
	P_crit = 150Kbytes/sec. So, if you need greater peak
	rates, use alpha with HZ=1000 :-)

	With classful TBF, limit is kept only for backwards compatibility.
	It is passed to the default bfifo qdisc - if the inner qdisc is
	changed, the limit is no longer effective.
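
	As a rough usage sketch (iproute2 syntax; the values are only an
	example, not taken from this file):

	    tc qdisc add dev eth0 root tbf rate 1mbit burst 10kb latency 70ms

	Here "burst" corresponds to the bucket depth B, and "latency"
	bounds the backlog, from which tc derives the limit that ends up
	in the inner bfifo.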
*/

struct tbf_sched_data
{
/* Parameters */
	u32		limit;		/* Maximal length of backlog: bytes */
	u32		buffer;		/* Token bucket depth/rate: MUST BE >= MTU/B */
	u32		mtu;
	u32		max_size;
	struct qdisc_rate_table	*R_tab;
	struct qdisc_rate_table	*P_tab;

/* Variables */
	long	tokens;			/* Current number of B tokens */
	long	ptokens;		/* Current number of P tokens */
	psched_time_t	t_c;		/* Time check-point */
	struct Qdisc	*qdisc;		/* Inner qdisc, default - bfifo queue */
	struct qdisc_watchdog watchdog;	/* Watchdog timer */
};

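/*
 * The rate tables supplied from user space cache, for each size slot,
 * the time needed to transmit a packet of that size at the configured
 * rate.  L2T()/L2T_P() index that table with the packet length, i.e.
 * they convert a length into a token cost in scheduler time units.
 */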
#define L2T(q,L)   ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log])
#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log])

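/*
 * Enqueue: packets larger than max_size can never earn enough tokens
 * under the configured bucket, so they are dropped outright (unless a
 * policer reshape_fail hook takes them); everything else goes to the
 * inner qdisc and the byte/packet counters are updated.
 */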
static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if (skb->len > q->max_size) {
		sch->qstats.drops++;
#ifdef CONFIG_NET_CLS_POLICE
		if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch))
#endif
			kfree_skb(skb);

		return NET_XMIT_DROP;
	}

	if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) {
		sch->qstats.drops++;
		return ret;
	}

	sch->q.qlen++;
	sch->bstats.bytes += skb->len;
	sch->bstats.packets++;
	return 0;
}

static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	int ret;

	if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) {
		sch->q.qlen++;
		sch->qstats.requeues++;
	}

	return ret;
}

static unsigned int tbf_drop(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned int len = 0;

	if (q->qdisc->ops->drop && (len = q->qdisc->ops->drop(q->qdisc)) != 0) {
		sch->q.qlen--;
		sch->qstats.drops++;
	}
	return len;
}

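/*
 * Dequeue: pull the head packet from the inner qdisc, convert the time
 * elapsed since the last checkpoint into tokens (bounded by the bucket
 * depth, and by the MTU for the peak-rate bucket), and charge the
 * packet against both buckets.  If either balance would go negative,
 * the packet is put back and the watchdog is armed for the moment when
 * enough tokens will have accumulated.
 */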
static struct sk_buff *tbf_dequeue(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	skb = q->qdisc->dequeue(q->qdisc);

	if (skb) {
		psched_time_t now;
		long toks;
		long ptoks = 0;
		unsigned int len = skb->len;

		now = psched_get_time();
		toks = psched_tdiff_bounded(now, q->t_c, q->buffer);

		if (q->P_tab) {
			ptoks = toks + q->ptokens;
			if (ptoks > (long)q->mtu)
				ptoks = q->mtu;
			ptoks -= L2T_P(q, len);
		}
		toks += q->tokens;
		if (toks > (long)q->buffer)
			toks = q->buffer;
		toks -= L2T(q, len);

		if ((toks|ptoks) >= 0) {
			q->t_c = now;
			q->tokens = toks;
			q->ptokens = ptoks;
			sch->q.qlen--;
			sch->flags &= ~TCQ_F_THROTTLED;
			return skb;
		}

		qdisc_watchdog_schedule(&q->watchdog,
					now + max_t(long, -toks, -ptoks));

		/* Maybe we have a shorter packet in the queue,
		   which could be sent now.  That sounds tempting,
		   but it is wrong in principle:
		   we MUST NOT reorder packets under these circumstances.

		   Really, if we split the flow into independent
		   subflows, it would be a very good solution.
		   This is the main idea of all FQ algorithms
		   (cf. CSZ, HPFQ, HFSC)
		 */

		if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) {
			/* When requeue fails skb is dropped */
			qdisc_tree_decrease_qlen(q->qdisc, 1);
			sch->qstats.drops++;
		}

		sch->qstats.overlimits++;
	}
	return NULL;
}

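/* Reset: flush the inner qdisc, refill both buckets and stop the watchdog. */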
static void tbf_reset(struct Qdisc* sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_reset(q->qdisc);
	sch->q.qlen = 0;
	q->t_c = psched_get_time();
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	qdisc_watchdog_cancel(&q->watchdog);
}

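/*
 * Build the default child: a bfifo qdisc holding at most "limit" bytes.
 * The fifo's change() op expects a tc_fifo_qopt wrapped in an rtattr,
 * so one is constructed here by hand.
 */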
static struct Qdisc *tbf_create_dflt_qdisc(struct Qdisc *sch, u32 limit)
{
	struct Qdisc *q;
	struct rtattr *rta;
	int ret;

	q = qdisc_create_dflt(sch->dev, &bfifo_qdisc_ops,
			      TC_H_MAKE(sch->handle, 1));
	if (q) {
		rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL);
		if (rta) {
			rta->rta_type = RTM_NEWQDISC;
			rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt));
			((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit;

			ret = q->ops->change(q, rta);
			kfree(rta);

			if (ret == 0)
				return q;
		}
		qdisc_destroy(q);
	}

	return NULL;
}

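/*
 * (Re)configure the qdisc: parse TCA_TBF_PARMS, look up the rate table
 * (and the optional peak-rate table), and derive max_size - the largest
 * packet whose token cost still fits in the buffer (and in the MTU for
 * the peak bucket).  A fresh bfifo child is installed when a limit is
 * given, and the new parameters are swapped in under the tree lock.
 */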
static int tbf_change(struct Qdisc* sch, struct rtattr *opt)
{
	int err = -EINVAL;
	struct tbf_sched_data *q = qdisc_priv(sch);
	struct rtattr *tb[TCA_TBF_PTAB];
	struct tc_tbf_qopt *qopt;
	struct qdisc_rate_table *rtab = NULL;
	struct qdisc_rate_table *ptab = NULL;
	struct Qdisc *child = NULL;
	int max_size,n;

	if (rtattr_parse_nested(tb, TCA_TBF_PTAB, opt) ||
	    tb[TCA_TBF_PARMS-1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt))
		goto done;

	qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]);
	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]);
	if (rtab == NULL)
		goto done;

	if (qopt->peakrate.rate) {
		if (qopt->peakrate.rate > qopt->rate.rate)
			ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]);
		if (ptab == NULL)
			goto done;
	}

	for (n = 0; n < 256; n++)
		if (rtab->data[n] > qopt->buffer) break;
	max_size = (n << qopt->rate.cell_log)-1;
	if (ptab) {
		int size;

		for (n = 0; n < 256; n++)
			if (ptab->data[n] > qopt->mtu) break;
		size = (n << qopt->peakrate.cell_log)-1;
		if (size < max_size) max_size = size;
	}
	if (max_size < 0)
		goto done;

	if (qopt->limit > 0) {
		if ((child = tbf_create_dflt_qdisc(sch, qopt->limit)) == NULL)
			goto done;
	}

	sch_tree_lock(sch);
	if (child) {
		qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
		qdisc_destroy(xchg(&q->qdisc, child));
	}
	q->limit = qopt->limit;
	q->mtu = qopt->mtu;
	q->max_size = max_size;
	q->buffer = qopt->buffer;
	q->tokens = q->buffer;
	q->ptokens = q->mtu;
	rtab = xchg(&q->R_tab, rtab);
	ptab = xchg(&q->P_tab, ptab);
	sch_tree_unlock(sch);
	err = 0;
done:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ptab)
		qdisc_put_rtab(ptab);
	return err;
}

static int tbf_init(struct Qdisc* sch, struct rtattr *opt)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (opt == NULL)
		return -EINVAL;

	q->t_c = psched_get_time();
	qdisc_watchdog_init(&q->watchdog, sch);
	q->qdisc = &noop_qdisc;

	return tbf_change(sch, opt);
}

static void tbf_destroy(struct Qdisc *sch)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);

	if (q->P_tab)
		qdisc_put_rtab(q->P_tab);
	if (q->R_tab)
		qdisc_put_rtab(q->R_tab);

	qdisc_destroy(q->qdisc);
}

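/*
 * Dump the current configuration back to user space as a TCA_OPTIONS
 * attribute containing a tc_tbf_qopt (peakrate is zeroed when no peak
 * bucket is configured).
 */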
static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct rtattr *rta;
	struct tc_tbf_qopt opt;

	rta = (struct rtattr*)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

	opt.limit = q->limit;
	opt.rate = q->R_tab->rate;
	if (q->P_tab)
		opt.peakrate = q->P_tab->rate;
	else
		memset(&opt.peakrate, 0, sizeof(opt.peakrate));
	opt.mtu = q->mtu;
	opt.buffer = q->buffer;
	RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb_tail_pointer(skb) - b;

	return skb->len;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tbf_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (cl != 1) 	/* only one class */
		return -ENOENT;

	tcm->tcm_handle |= TC_H_MIN(1);
	tcm->tcm_info = q->qdisc->handle;

	return 0;
}

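/*
 * Graft: swap in a new inner qdisc (noop when none is given) and hand
 * the old one back to the caller after purging its backlog.
 */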
static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct tbf_sched_data *q = qdisc_priv(sch);

	if (new == NULL)
		new = &noop_qdisc;

	sch_tree_lock(sch);
	*old = xchg(&q->qdisc, new);
	qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
	qdisc_reset(*old);
	sch_tree_unlock(sch);

	return 0;
}

static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct tbf_sched_data *q = qdisc_priv(sch);
	return q->qdisc;
}

static unsigned long tbf_get(struct Qdisc *sch, u32 classid)
{
	return 1;
}

static void tbf_put(struct Qdisc *sch, unsigned long arg)
{
}

static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
			    struct rtattr **tca, unsigned long *arg)
{
	return -ENOSYS;
}

static int tbf_delete(struct Qdisc *sch, unsigned long arg)
{
	return -ENOSYS;
}

static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker)
{
	if (!walker->stop) {
		if (walker->count >= walker->skip)
			if (walker->fn(sch, 1, walker) < 0) {
				walker->stop = 1;
				return;
			}
		walker->count++;
	}
}

static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	return NULL;
}

static struct Qdisc_class_ops tbf_class_ops =
{
	.graft		=	tbf_graft,
	.leaf		=	tbf_leaf,
	.get		=	tbf_get,
	.put		=	tbf_put,
	.change		=	tbf_change_class,
	.delete		=	tbf_delete,
	.walk		=	tbf_walk,
	.tcf_chain	=	tbf_find_tcf,
	.dump		=	tbf_dump_class,
};

static struct Qdisc_ops tbf_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	&tbf_class_ops,
	.id		=	"tbf",
	.priv_size	=	sizeof(struct tbf_sched_data),
	.enqueue	=	tbf_enqueue,
	.dequeue	=	tbf_dequeue,
	.requeue	=	tbf_requeue,
	.drop		=	tbf_drop,
	.init		=	tbf_init,
	.reset		=	tbf_reset,
	.destroy	=	tbf_destroy,
	.change		=	tbf_change,
	.dump		=	tbf_dump,
	.owner		=	THIS_MODULE,
};

static int __init tbf_module_init(void)
{
	return register_qdisc(&tbf_qdisc_ops);
}

static void __exit tbf_module_exit(void)
{
	unregister_qdisc(&tbf_qdisc_ops);
}
module_init(tbf_module_init)
module_exit(tbf_module_exit)
MODULE_LICENSE("GPL");