/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic from non-interactive traffic. This means that the scheduler
	should be used as a leaf of CBQ or P3, which put interactive
	traffic into a higher priority band.

	We still need true WFQ for the top level CSZ, but using WFQ
	for best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits the maximal queue length to 128,
	the maximal mtu to 2^15-1 and the number of hash buckets to 1024.
	The only goal of these restrictions was that all data
	fit into one 4K page :-). Struct sfq_sched_data is
	organized in an anti-cache manner: all the data for a bucket
	are scattered over different locations. This is not good,
	but it allowed me to put it into 4K.

	It is easy to increase these values, but not in flight.  */

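/*
 * Illustrative example: from user space (iproute2) SFQ is typically
 * attached as a leaf qdisc, e.g.
 *
 *	tc qdisc add dev eth0 root sfq perturb 10
 *
 * where "perturb 10" rehashes flows every 10 seconds (perturb_period
 * below), and "quantum"/"limit" map onto the corresponding fields of
 * struct tc_sfq_qopt handled by sfq_change().
 */
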
#define SFQ_DEPTH		128
#define SFQ_HASH_DIVISOR	1024

/* This type should contain at least SFQ_DEPTH*2 values */
typedef unsigned char sfq_index;

struct sfq_head
{
	sfq_index	next;
	sfq_index	prev;
};

struct sfq_sched_data
{
/* Parameters */
	int		perturb_period;
	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
	int		limit;

/* Variables */
	struct tcf_proto *filter_list;
	struct timer_list perturb_timer;
	u32		perturbation;
	sfq_index	tail;		/* Index of current slot in round */
	sfq_index	max_depth;	/* Maximal depth */

	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
	struct sk_buff_head	qs[SFQ_DEPTH];		/* Slot queue */
	struct sfq_head	dep[SFQ_DEPTH*2];	/* Linked list of slots, indexed by depth */
};

static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
}

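/*
 * Map a packet to one of SFQ_HASH_DIVISOR buckets.  For IPv4 the hash
 * covers the addresses, the protocol and, for unfragmented TCP/UDP/
 * UDP-Lite/SCTP/DCCP/ESP packets, the first 32 bits of the transport
 * header (port pair or SPI).  For IPv6 the low 32 bits of each address
 * are used instead.  Anything else falls back to the dst entry and the
 * socket pointer.  The perturbation value salts the hash via
 * sfq_fold_hash().
 */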
static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
	u32 h, h2;

	switch (skb->protocol) {
	case htons(ETH_P_IP):
	{
		const struct iphdr *iph;

		if (!pskb_network_may_pull(skb, sizeof(*iph)))
			goto err;
		iph = ip_hdr(skb);
		h = (__force u32)iph->daddr;
		h2 = (__force u32)iph->saddr ^ iph->protocol;
		if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
		    (iph->protocol == IPPROTO_TCP ||
		     iph->protocol == IPPROTO_UDP ||
		     iph->protocol == IPPROTO_UDPLITE ||
		     iph->protocol == IPPROTO_SCTP ||
		     iph->protocol == IPPROTO_DCCP ||
		     iph->protocol == IPPROTO_ESP) &&
		     pskb_network_may_pull(skb, iph->ihl * 4 + 4))
			h2 ^= *(((u32*)iph) + iph->ihl);
		break;
	}
	case htons(ETH_P_IPV6):
	{
		struct ipv6hdr *iph;

		if (!pskb_network_may_pull(skb, sizeof(*iph)))
			goto err;
		iph = ipv6_hdr(skb);
		h = (__force u32)iph->daddr.s6_addr32[3];
		h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
		if ((iph->nexthdr == IPPROTO_TCP ||
		     iph->nexthdr == IPPROTO_UDP ||
		     iph->nexthdr == IPPROTO_UDPLITE ||
		     iph->nexthdr == IPPROTO_SCTP ||
		     iph->nexthdr == IPPROTO_DCCP ||
		     iph->nexthdr == IPPROTO_ESP) &&
		    pskb_network_may_pull(skb, sizeof(*iph) + 4))
			h2 ^= *(u32*)&iph[1];
		break;
	}
	default:
err:
		h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
		h2 = (unsigned long)skb->sk;
	}

	return sfq_fold_hash(q, h, h2);
}

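/*
 * Classify a packet to a hash bucket.  Returns bucket index + 1, or 0 to
 * signal that the packet should be dropped (with *qerr describing why).
 * skb->priority may select a bucket directly; otherwise any attached
 * filters are consulted, and without filters the flow hash is used.
 */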
static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
		return TC_H_MIN(skb->priority);

	if (!q->filter_list)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			/* fall through */
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

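/*
 * Slots are kept on doubly linked lists indexed by queue length:
 * dep[qlen + SFQ_DEPTH] heads the list of all slots currently holding
 * qlen packets (so dep[SFQ_DEPTH] lists the empty slots).  sfq_inc()
 * and sfq_dec() move a slot to the right list after its queue length
 * changes and keep max_depth up to date, which lets sfq_drop() find the
 * longest slot in O(1).
 */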
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d = q->qs[x].qlen + SFQ_DEPTH;

	p = d;
	n = q->dep[d].next;
	q->dep[x].next = n;
	q->dep[x].prev = p;
	q->dep[p].next = q->dep[n].prev = x;
}

static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;

	if (n == p && q->max_depth == q->qs[x].qlen + 1)
		q->max_depth--;

	sfq_link(q, x);
}

static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index p, n;
	int d;

	n = q->dep[x].next;
	p = q->dep[x].prev;
	q->dep[p].next = n;
	q->dep[n].prev = p;
	d = q->qs[x].qlen;
	if (q->max_depth < d)
		q->max_depth = d;

	sfq_link(q, x);
}

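/*
 * Drop one packet from the tail of the longest slot.  When every active
 * slot holds exactly one packet, the slot after the current tail is
 * sacrificed and unlinked from the round-robin ring.  Returns the length
 * of the dropped packet, or 0 if the qdisc is empty.
 */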
static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index d = q->max_depth;
	struct sk_buff *skb;
	unsigned int len;

	/* Queue is full! Find the longest slot and
	   drop a packet from it */

	if (d > 1) {
		sfq_index x = q->dep[d + SFQ_DEPTH].next;
		skb = q->qs[x].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[x]);
		kfree_skb(skb);
		sfq_dec(q, x);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		d = q->next[q->tail];
		q->next[q->tail] = q->next[d];
		q->allot[q->next[d]] += q->quantum;
		skb = q->qs[d].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[d]);
		kfree_skb(skb);
		sfq_dec(q, d);
		sch->q.qlen--;
		q->ht[q->hash[d]] = SFQ_DEPTH;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	return 0;
}

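/*
 * Enqueue: classify the packet into a bucket, append it to that bucket's
 * slot (allocating a free slot for a new flow and linking it into the
 * round-robin ring), then drop from the longest slot if the total queue
 * length exceeds the configured limit.
 */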
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int hash;
	sfq_index x;
	int uninitialized_var(ret);

	hash = sfq_classify(skb, sch, &ret);
	if (hash == 0) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	hash--;

	x = q->ht[hash];
	if (x == SFQ_DEPTH) {
		q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
		q->hash[x] = hash;
	}

	/* If the selected queue has length q->limit, this means that
	 * all other queues are empty and that we do simple tail drop,
	 * i.e. drop _this_ packet.
	 */
	if (q->qs[x].qlen >= q->limit)
		return qdisc_drop(skb, sch);

	sch->qstats.backlog += qdisc_pkt_len(skb);
	__skb_queue_tail(&q->qs[x], skb);
	sfq_inc(q, x);
	if (q->qs[x].qlen == 1) {		/* The flow is new */
		if (q->tail == SFQ_DEPTH) {	/* It is the first flow */
			q->tail = x;
			q->next[x] = x;
			q->allot[x] = q->quantum;
		} else {
			q->next[x] = q->next[q->tail];
			q->next[q->tail] = x;
			q->tail = x;
		}
	}
	if (++sch->q.qlen <= q->limit) {
		sch->bstats.bytes += qdisc_pkt_len(skb);
		sch->bstats.packets++;
		return NET_XMIT_SUCCESS;
	}

	sfq_drop(sch);
	return NET_XMIT_CN;
}

static struct sk_buff *
sfq_peek(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = q->next[q->tail];
	return skb_peek(&q->qs[a]);
}

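/*
 * Dequeue: take the head packet of the slot at the front of the
 * round-robin ring.  A slot that becomes empty is unlinked from the ring
 * and its hash bucket is cleared; a slot that has exhausted its byte
 * allotment is rotated to the back of the ring.  In both cases the slot
 * that moves to the front has its allotment topped up by one quantum.
 */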
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index a, old_a;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	a = old_a = q->next[q->tail];

	/* Grab packet */
	skb = __skb_dequeue(&q->qs[a]);
	sfq_dec(q, a);
	sch->q.qlen--;
	sch->qstats.backlog -= qdisc_pkt_len(skb);

	/* Is the slot empty? */
	if (q->qs[a].qlen == 0) {
		q->ht[q->hash[a]] = SFQ_DEPTH;
		a = q->next[a];
		if (a == old_a) {
			q->tail = SFQ_DEPTH;
			return skb;
		}
		q->next[q->tail] = a;
		q->allot[a] += q->quantum;
	} else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
		q->tail = a;
		a = q->next[a];
		q->allot[a] += q->quantum;
	}
	return skb;
}

static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb;

	while ((skb = sfq_dequeue(sch)) != NULL)
		kfree_skb(skb);
}

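/*
 * Timer callback: pick a new random perturbation value so that flows are
 * re-hashed into different buckets, then re-arm the timer if periodic
 * perturbation is configured.
 */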
static void sfq_perturbation(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct sfq_sched_data *q = qdisc_priv(sch);

	q->perturbation = net_random();

	if (q->perturb_period)
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

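/*
 * Apply new parameters from a netlink TCA_OPTIONS attribute (struct
 * tc_sfq_qopt): quantum, perturbation period and queue limit.  Excess
 * packets are dropped under the tree lock and the perturbation timer is
 * restarted as needed.
 */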
static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	unsigned int qlen;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;

	sch_tree_lock(sch);
	q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->limit)
		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);

	qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = net_random();
	}
	sch_tree_unlock(sch);
	return 0;
}

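/*
 * Initialise the qdisc: set up the perturbation timer, mark every hash
 * bucket and the round-robin ring as empty, link all slots onto the
 * empty-slot list, and apply either the defaults or the supplied
 * configuration.
 */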
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

	for (i = 0; i < SFQ_HASH_DIVISOR; i++)
		q->ht[i] = SFQ_DEPTH;

	for (i = 0; i < SFQ_DEPTH; i++) {
		skb_queue_head_init(&q->qs[i]);
		q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
		q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
	}

	q->limit = SFQ_DEPTH - 1;
	q->max_depth = 0;
	q->tail = SFQ_DEPTH;
	if (opt == NULL) {
		q->quantum = psched_mtu(qdisc_dev(sch));
		q->perturb_period = 0;
		q->perturbation = net_random();
	} else {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	for (i = 0; i < SFQ_DEPTH; i++)
		sfq_link(q, i);
	return 0;
}

static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
}

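/*
 * Dump the current configuration (quantum, perturbation period, limit,
 * divisor, flows) back to user space as a struct tc_sfq_qopt in
 * TCA_OPTIONS.
 */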
static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt opt;

	opt.quantum = q->quantum;
	opt.perturb_period = q->perturb_period / HZ;

	opt.limit = q->limit;
	opt.divisor = SFQ_HASH_DIVISOR;
	opt.flows = q->limit;

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}

static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (cl)
		return NULL;
	return &q->filter_list;
}

static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle |= TC_H_MIN(cl);
	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl - 1];
	struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
	struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };

	if (gnet_stats_copy_queue(d, &qs) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int i;

	if (arg->stop)
		return;

	for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
		if (q->ht[i] == SFQ_DEPTH ||
		    arg->count < arg->skip) {
			arg->count++;
			continue;
		}
		if (arg->fn(sch, i + 1, arg) < 0) {
			arg->stop = 1;
			break;
		}
		arg->count++;
	}
}

static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.put		=	sfq_put,
	.tcf_chain	=	sfq_find_tcf,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_put,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	sfq_peek,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");