1/*
2 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
3 *
4 *		This program is free software; you can redistribute it and/or
5 *		modify it under the terms of the GNU General Public License
6 *		as published by the Free Software Foundation; either version
7 *		2 of the License, or (at your option) any later version.
8 *
9 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 */
12
13#include <linux/module.h>
14#include <asm/uaccess.h>
15#include <asm/system.h>
16#include <linux/bitops.h>
17#include <linux/types.h>
18#include <linux/kernel.h>
19#include <linux/string.h>
20#include <linux/mm.h>
21#include <linux/socket.h>
22#include <linux/sockios.h>
23#include <linux/in.h>
24#include <linux/errno.h>
25#include <linux/interrupt.h>
26#include <linux/if_ether.h>
27#include <linux/inet.h>
28#include <linux/netdevice.h>
29#include <linux/etherdevice.h>
30#include <linux/notifier.h>
31#include <net/ip.h>
32#include <net/netlink.h>
33#include <net/route.h>
34#include <linux/skbuff.h>
35#include <net/sock.h>
36#include <net/pkt_sched.h>
37
38
39
40struct cbq_sched_data;
41
42
43struct cbq_class
44{
45	struct cbq_class	*next;		/* hash table link */
46	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
47
48/* Parameters */
49	u32			classid;
50	unsigned char		priority;	/* class priority */
51	unsigned char		priority2;	/* priority to be used after overlimit */
52	unsigned char		ewma_log;	/* time constant for idle time calculation */
53	unsigned char		ovl_strategy;
54#ifdef CONFIG_NET_CLS_POLICE
55	unsigned char		police;
56#endif
57
58	u32			defmap;
59
60	/* Link-sharing scheduler parameters */
61	long			maxidle;	/* Class parameters: see below. */
62	long			offtime;
63	long			minidle;
64	u32			avpkt;
65	struct qdisc_rate_table	*R_tab;
66
67	/* Overlimit strategy parameters */
68	void			(*overlimit)(struct cbq_class *cl);
69	psched_tdiff_t		penalty;
70
71	/* General scheduler (WRR) parameters */
72	long			allot;
73	long			quantum;	/* Allotment per WRR round */
74	long			weight;		/* Relative allotment: see below */
75
76	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
77	struct cbq_class	*split;		/* Ptr to split node */
78	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
79	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
80	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
81						   parent otherwise */
82	struct cbq_class	*sibling;	/* Sibling chain */
83	struct cbq_class	*children;	/* Pointer to children chain */
84
85	struct Qdisc		*q;		/* Elementary queueing discipline */
86
87
88/* Variables */
89	unsigned char		cpriority;	/* Effective priority */
90	unsigned char		delayed;
91	unsigned char		level;		/* level of the class in hierarchy:
92						   0 for leaf classes, and maximal
93						   level of children + 1 for nodes.
94						 */
95
96	psched_time_t		last;		/* Last end of service */
97	psched_time_t		undertime;
98	long			avgidle;
99	long			deficit;	/* Saved deficit for WRR */
100	psched_time_t		penalized;
101	struct gnet_stats_basic bstats;
102	struct gnet_stats_queue qstats;
103	struct gnet_stats_rate_est rate_est;
104	spinlock_t		*stats_lock;
105	struct tc_cbq_xstats	xstats;
106
107	struct tcf_proto	*filter_list;
108
109	int			refcnt;
110	int			filters;
111
112	struct cbq_class 	*defaults[TC_PRIO_MAX+1];
113};
114
115struct cbq_sched_data
116{
117	struct cbq_class	*classes[16];		/* Hash table of all classes */
118	int			nclasses[TC_CBQ_MAXPRIO+1];
119	unsigned		quanta[TC_CBQ_MAXPRIO+1];
120
121	struct cbq_class	link;
122
123	unsigned		activemask;
124	struct cbq_class	*active[TC_CBQ_MAXPRIO+1];	/* List of all classes
125								   with backlog */
126
127#ifdef CONFIG_NET_CLS_POLICE
128	struct cbq_class	*rx_class;
129#endif
130	struct cbq_class	*tx_class;
131	struct cbq_class	*tx_borrowed;
132	int			tx_len;
133	psched_time_t		now;		/* Cached timestamp */
134	psched_time_t		now_rt;		/* Cached real time */
135	unsigned		pmask;
136
137	struct hrtimer		delay_timer;
138	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
139						   started when CBQ has
140						   backlog, but cannot
141						   transmit just now */
142	psched_tdiff_t		wd_expires;
143	int			toplevel;
144	u32			hgenerator;
145};
146
147
148#define L2T(cl,len)	((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log])
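
/*
 * Illustrative note on L2T() (not used by the code): it converts a
 * packet length into transmission time by indexing the 256-slot rate
 * table that userspace tc computed for this class's configured rate.
 * Assuming a hypothetical cell_log of 3, a 1000-byte packet resolves
 * as
 *
 *	psched_tdiff_t t = cl->R_tab->data[1000 >> 3];	(slot 125)
 *
 * i.e. lengths are bucketed into 2^cell_log byte cells and each slot
 * already holds the per-cell transmit time.
 */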
149
150
151static __inline__ unsigned cbq_hash(u32 h)
152{
153	h ^= h>>8;
154	h ^= h>>4;
155	return h&0xF;
156}
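
/*
 * Worked example for cbq_hash() (the classid is made up): the hash
 * folds the 32-bit classid down to one of 16 buckets, e.g. for
 * classid 0x00010012 (1:12):
 *
 *	h ^= h >> 8;	-> 0x00010112
 *	h ^= h >> 4;	-> 0x00011103
 *	h & 0xF		-> bucket 3
 *
 * Each bucket of q->classes[] is a singly linked list threaded through
 * cbq_class::next.
 */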
157
158static __inline__ struct cbq_class *
159cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
160{
161	struct cbq_class *cl;
162
163	for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next)
164		if (cl->classid == classid)
165			return cl;
166	return NULL;
167}
168
169#ifdef CONFIG_NET_CLS_POLICE
170
171static struct cbq_class *
172cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
173{
174	struct cbq_class *cl, *new;
175
176	for (cl = this->tparent; cl; cl = cl->tparent)
177		if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this)
178			return new;
179
180	return NULL;
181}
182
183#endif
184
/* Classify a packet. The procedure is fairly involved, but it lets us
   combine link sharing and priority scheduling transparently.

   Namely, you can put link-sharing rules (e.g. route based) at the root
   of CBQ, so that classification resolves to split nodes. Packets are
   then classified by logical priority, or a more specific classifier
   may be attached to the split node.
 */
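
/*
 * Minimal sketch of the resolution step inside the loop below (an
 * illustration, not a formal specification): when a filter attached to
 * the current head matches, but res.class is NULL and res.classid only
 * carries a minor number, that minor is treated as a logical priority
 * and mapped through the split node's default table:
 *
 *	cl = head->defaults[res.classid & TC_PRIO_MAX];
 *	if (cl == NULL)
 *		cl = head->defaults[TC_PRIO_BESTEFFORT];
 *
 * If the resulting class is an inner node, the loop repeats with it as
 * the new head, until a leaf class (level 0) is reached.
 */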
194
195static struct cbq_class *
196cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
197{
198	struct cbq_sched_data *q = qdisc_priv(sch);
199	struct cbq_class *head = &q->link;
200	struct cbq_class **defmap;
201	struct cbq_class *cl = NULL;
202	u32 prio = skb->priority;
203	struct tcf_result res;
204
205	/*
206	 *  Step 1. If skb->priority points to one of our classes, use it.
207	 */
208	if (TC_H_MAJ(prio^sch->handle) == 0 &&
209	    (cl = cbq_class_lookup(q, prio)) != NULL)
210		return cl;
211
212	*qerr = NET_XMIT_BYPASS;
213	for (;;) {
214		int result = 0;
215		defmap = head->defaults;
216
217		/*
218		 * Step 2+n. Apply classifier.
219		 */
220		if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0)
221			goto fallback;
222
223		if ((cl = (void*)res.class) == NULL) {
224			if (TC_H_MAJ(res.classid))
225				cl = cbq_class_lookup(q, res.classid);
226			else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL)
227				cl = defmap[TC_PRIO_BESTEFFORT];
228
229			if (cl == NULL || cl->level >= head->level)
230				goto fallback;
231		}
232
233#ifdef CONFIG_NET_CLS_ACT
234		switch (result) {
235		case TC_ACT_QUEUED:
236		case TC_ACT_STOLEN:
237			*qerr = NET_XMIT_SUCCESS;
238		case TC_ACT_SHOT:
239			return NULL;
240		}
241#elif defined(CONFIG_NET_CLS_POLICE)
242		switch (result) {
243		case TC_POLICE_RECLASSIFY:
244			return cbq_reclassify(skb, cl);
245		case TC_POLICE_SHOT:
246			return NULL;
247		default:
248			break;
249		}
250#endif
251		if (cl->level == 0)
252			return cl;
253
		/*
		 * Step 3+n. If the classifier selected a link-sharing class,
		 *	   apply the agency-specific classifier.
		 *	   Repeat this procedure until we hit a leaf node.
		 */
259		head = cl;
260	}
261
262fallback:
263	cl = head;
264
265	/*
266	 * Step 4. No success...
267	 */
268	if (TC_H_MAJ(prio) == 0 &&
269	    !(cl = head->defaults[prio&TC_PRIO_MAX]) &&
270	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
271		return head;
272
273	return cl;
274}
275
/*
   A packet has just been enqueued to an empty class.
   cbq_activate_class adds the class to the tail of the active class
   list of its priority band.
 */
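
/*
 * The active list of a priority band is a circular, singly linked list
 * threaded through next_alive, with q->active[prio] pointing at the
 * tail.  A small example (class names are arbitrary): with the band
 * holding A and B, tail B and traversal A -> B -> A, activating C does
 *
 *	C->next_alive = B->next_alive;	(C now points at head A)
 *	B->next_alive = C;
 *	q->active[prio] = C;		(C becomes the new tail)
 *
 * so the traversal becomes A -> B -> C -> A.
 */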
281
282static __inline__ void cbq_activate_class(struct cbq_class *cl)
283{
284	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
285	int prio = cl->cpriority;
286	struct cbq_class *cl_tail;
287
288	cl_tail = q->active[prio];
289	q->active[prio] = cl;
290
291	if (cl_tail != NULL) {
292		cl->next_alive = cl_tail->next_alive;
293		cl_tail->next_alive = cl;
294	} else {
295		cl->next_alive = cl;
296		q->activemask |= (1<<prio);
297	}
298}
299
/*
   Unlink a class from the active chain.
   Note that the same unlinking is also done inline in cbq_dequeue_prio
   as part of the round-robin procedure.
 */
305
306static void cbq_deactivate_class(struct cbq_class *this)
307{
308	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
309	int prio = this->cpriority;
310	struct cbq_class *cl;
311	struct cbq_class *cl_prev = q->active[prio];
312
313	do {
314		cl = cl_prev->next_alive;
315		if (cl == this) {
316			cl_prev->next_alive = cl->next_alive;
317			cl->next_alive = NULL;
318
319			if (cl == q->active[prio]) {
320				q->active[prio] = cl_prev;
321				if (cl == q->active[prio]) {
322					q->active[prio] = NULL;
323					q->activemask &= ~(1<<prio);
324					return;
325				}
326			}
327			return;
328		}
329	} while ((cl_prev = cl) != q->active[prio]);
330}
331
332static void
333cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
334{
335	int toplevel = q->toplevel;
336
337	if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) {
338		psched_time_t now;
339		psched_tdiff_t incr;
340
341		now = psched_get_time();
342		incr = now - q->now_rt;
343		now = q->now + incr;
344
345		do {
346			if (cl->undertime < now) {
347				q->toplevel = cl->level;
348				return;
349			}
350		} while ((cl=cl->borrow) != NULL && toplevel > cl->level);
351	}
352}
353
354static int
355cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
356{
357	struct cbq_sched_data *q = qdisc_priv(sch);
358	int len = skb->len;
359	int ret;
360	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
361
362#ifdef CONFIG_NET_CLS_POLICE
363	q->rx_class = cl;
364#endif
365	if (cl == NULL) {
366		if (ret == NET_XMIT_BYPASS)
367			sch->qstats.drops++;
368		kfree_skb(skb);
369		return ret;
370	}
371
372#ifdef CONFIG_NET_CLS_POLICE
373	cl->q->__parent = sch;
374#endif
375	if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) {
376		sch->q.qlen++;
377		sch->bstats.packets++;
378		sch->bstats.bytes+=len;
379		cbq_mark_toplevel(q, cl);
380		if (!cl->next_alive)
381			cbq_activate_class(cl);
382		return ret;
383	}
384
385	sch->qstats.drops++;
386	cbq_mark_toplevel(q, cl);
387	cl->qstats.drops++;
388	return ret;
389}
390
391static int
392cbq_requeue(struct sk_buff *skb, struct Qdisc *sch)
393{
394	struct cbq_sched_data *q = qdisc_priv(sch);
395	struct cbq_class *cl;
396	int ret;
397
398	if ((cl = q->tx_class) == NULL) {
399		kfree_skb(skb);
400		sch->qstats.drops++;
401		return NET_XMIT_CN;
402	}
403	q->tx_class = NULL;
404
405	cbq_mark_toplevel(q, cl);
406
407#ifdef CONFIG_NET_CLS_POLICE
408	q->rx_class = cl;
409	cl->q->__parent = sch;
410#endif
411	if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) {
412		sch->q.qlen++;
413		sch->qstats.requeues++;
414		if (!cl->next_alive)
415			cbq_activate_class(cl);
416		return 0;
417	}
418	sch->qstats.drops++;
419	cl->qstats.drops++;
420	return ret;
421}
422
423/* Overlimit actions */
424
/* TC_CBQ_OVL_CLASSIC: (default) penalize the leaf class by adding offtime */
426
427static void cbq_ovl_classic(struct cbq_class *cl)
428{
429	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
430	psched_tdiff_t delay = cl->undertime - q->now;
431
432	if (!cl->delayed) {
433		delay += cl->offtime;
434
		/*
		   The class goes to sleep, so that it will have no
		   chance to accumulate avgidle. Let's forgive it 8)

		   BTW cbq-2.0 has a bug in this place: apparently they
		   forgot to shift the value by cl->ewma_log.
		 */
442		if (cl->avgidle < 0)
443			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
444		if (cl->avgidle < cl->minidle)
445			cl->avgidle = cl->minidle;
446		if (delay <= 0)
447			delay = 1;
448		cl->undertime = q->now + delay;
449
450		cl->xstats.overactions++;
451		cl->delayed = 1;
452	}
453	if (q->wd_expires == 0 || q->wd_expires > delay)
454		q->wd_expires = delay;
455
	/* Dirty work! We must schedule wakeups based on the
	   real available rate, rather than the leaf rate,
	   which may be tiny (even zero).
	 */
460	if (q->toplevel == TC_CBQ_MAXLEVEL) {
461		struct cbq_class *b;
462		psched_tdiff_t base_delay = q->wd_expires;
463
464		for (b = cl->borrow; b; b = b->borrow) {
465			delay = b->undertime - q->now;
466			if (delay < base_delay) {
467				if (delay <= 0)
468					delay = 1;
469				base_delay = delay;
470			}
471		}
472
473		q->wd_expires = base_delay;
474	}
475}
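
/*
 * Worked example for cbq_ovl_classic() (numbers are hypothetical):
 * suppose cl->undertime is 1000 ticks ahead of q->now, offtime is
 * 4000, avgidle is -2000 and ewma_log is 5.  For a class that was not
 * already delayed:
 *
 *	delay  = undertime - now		=  1000
 *	delay += offtime			=  5000
 *	delay -= 2000 - (2000 >> 5)		=  5000 - 1938 = 3062
 *
 * so undertime is pushed ~3062 ticks into the future, and wd_expires
 * is lowered to that value if it was unset or larger, so the watchdog
 * fires no later than the moment the class becomes eligible again.
 */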
476
/* TC_CBQ_OVL_RCLASSIC: penalize, by adding offtime, the classes in the
   hierarchy that have gone overlimit
 */
480
481static void cbq_ovl_rclassic(struct cbq_class *cl)
482{
483	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
484	struct cbq_class *this = cl;
485
486	do {
487		if (cl->level > q->toplevel) {
488			cl = NULL;
489			break;
490		}
491	} while ((cl = cl->borrow) != NULL);
492
493	if (cl == NULL)
494		cl = this;
495	cbq_ovl_classic(cl);
496}
497
/* TC_CBQ_OVL_DELAY: delay until the class goes underlimit */
499
500static void cbq_ovl_delay(struct cbq_class *cl)
501{
502	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
503	psched_tdiff_t delay = cl->undertime - q->now;
504
505	if (!cl->delayed) {
506		psched_time_t sched = q->now;
507		ktime_t expires;
508
509		delay += cl->offtime;
510		if (cl->avgidle < 0)
511			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
512		if (cl->avgidle < cl->minidle)
513			cl->avgidle = cl->minidle;
514		cl->undertime = q->now + delay;
515
516		if (delay > 0) {
517			sched += delay + cl->penalty;
518			cl->penalized = sched;
519			cl->cpriority = TC_CBQ_MAXPRIO;
520			q->pmask |= (1<<TC_CBQ_MAXPRIO);
521
522			expires = ktime_set(0, 0);
523			expires = ktime_add_ns(expires, PSCHED_US2NS(sched));
524			if (hrtimer_try_to_cancel(&q->delay_timer) &&
525			    ktime_to_ns(ktime_sub(q->delay_timer.expires,
526						  expires)) > 0)
527				q->delay_timer.expires = expires;
528			hrtimer_restart(&q->delay_timer);
529			cl->delayed = 1;
530			cl->xstats.overactions++;
531			return;
532		}
533		delay = 1;
534	}
535	if (q->wd_expires == 0 || q->wd_expires > delay)
536		q->wd_expires = delay;
537}
538
539/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */
540
541static void cbq_ovl_lowprio(struct cbq_class *cl)
542{
543	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
544
545	cl->penalized = q->now + cl->penalty;
546
547	if (cl->cpriority != cl->priority2) {
548		cl->cpriority = cl->priority2;
549		q->pmask |= (1<<cl->cpriority);
550		cl->xstats.overactions++;
551	}
552	cbq_ovl_classic(cl);
553}
554
555/* TC_CBQ_OVL_DROP: penalize class by dropping */
556
557static void cbq_ovl_drop(struct cbq_class *cl)
558{
559	if (cl->q->ops->drop)
560		if (cl->q->ops->drop(cl->q))
561			cl->qdisc->q.qlen--;
562	cl->xstats.overactions++;
563	cbq_ovl_classic(cl);
564}
565
566static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
567				       psched_time_t now)
568{
569	struct cbq_class *cl;
570	struct cbq_class *cl_prev = q->active[prio];
571	psched_time_t sched = now;
572
573	if (cl_prev == NULL)
574		return 0;
575
576	do {
577		cl = cl_prev->next_alive;
578		if (now - cl->penalized > 0) {
579			cl_prev->next_alive = cl->next_alive;
580			cl->next_alive = NULL;
581			cl->cpriority = cl->priority;
582			cl->delayed = 0;
583			cbq_activate_class(cl);
584
585			if (cl == q->active[prio]) {
586				q->active[prio] = cl_prev;
587				if (cl == q->active[prio]) {
588					q->active[prio] = NULL;
589					return 0;
590				}
591			}
592
593			cl = cl_prev->next_alive;
594		} else if (sched - cl->penalized > 0)
595			sched = cl->penalized;
596	} while ((cl_prev = cl) != q->active[prio]);
597
598	return sched - now;
599}
600
601static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
602{
603	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
604						delay_timer);
605	struct Qdisc *sch = q->watchdog.qdisc;
606	psched_time_t now;
607	psched_tdiff_t delay = 0;
608	unsigned pmask;
609
610	now = psched_get_time();
611
612	pmask = q->pmask;
613	q->pmask = 0;
614
615	while (pmask) {
616		int prio = ffz(~pmask);
617		psched_tdiff_t tmp;
618
619		pmask &= ~(1<<prio);
620
621		tmp = cbq_undelay_prio(q, prio, now);
622		if (tmp > 0) {
623			q->pmask |= 1<<prio;
624			if (tmp < delay || delay == 0)
625				delay = tmp;
626		}
627	}
628
629	if (delay) {
630		ktime_t time;
631
632		time = ktime_set(0, 0);
633		time = ktime_add_ns(time, PSCHED_US2NS(now + delay));
634		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS);
635	}
636
637	sch->flags &= ~TCQ_F_THROTTLED;
638	netif_schedule(sch->dev);
639	return HRTIMER_NORESTART;
640}
641
642
643#ifdef CONFIG_NET_CLS_POLICE
644
645static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
646{
647	int len = skb->len;
648	struct Qdisc *sch = child->__parent;
649	struct cbq_sched_data *q = qdisc_priv(sch);
650	struct cbq_class *cl = q->rx_class;
651
652	q->rx_class = NULL;
653
654	if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
655
656		cbq_mark_toplevel(q, cl);
657
658		q->rx_class = cl;
659		cl->q->__parent = sch;
660
661		if (cl->q->enqueue(skb, cl->q) == 0) {
662			sch->q.qlen++;
663			sch->bstats.packets++;
664			sch->bstats.bytes+=len;
665			if (!cl->next_alive)
666				cbq_activate_class(cl);
667			return 0;
668		}
669		sch->qstats.drops++;
670		return 0;
671	}
672
673	sch->qstats.drops++;
674	return -1;
675}
676#endif
677
/*
   This is a mission-critical procedure.

   We "regenerate" the toplevel cutoff if the transmitting class
   has backlog and is not regulated. This is not part of the
   original CBQ description, but it looks more reasonable.
   It may well be wrong; the question needs further investigation.
*/
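
/*
 * For orientation: q->toplevel acts as a borrowing cutoff.  In
 * cbq_under_limit() a class may only borrow from ancestors whose level
 * does not exceed q->toplevel, so lowering q->toplevel here restricts
 * borrowing again after cbq_dequeue() has relaxed it back to
 * TC_CBQ_MAXLEVEL.
 */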
686
687static __inline__ void
688cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
689		    struct cbq_class *borrowed)
690{
691	if (cl && q->toplevel >= borrowed->level) {
692		if (cl->q->q.qlen > 1) {
693			do {
694				if (borrowed->undertime == PSCHED_PASTPERFECT) {
695					q->toplevel = borrowed->level;
696					return;
697				}
698			} while ((borrowed=borrowed->borrow) != NULL);
699		}
700	}
701}
702
703static void
704cbq_update(struct cbq_sched_data *q)
705{
706	struct cbq_class *this = q->tx_class;
707	struct cbq_class *cl = this;
708	int len = q->tx_len;
709
710	q->tx_class = NULL;
711
712	for ( ; cl; cl = cl->share) {
713		long avgidle = cl->avgidle;
714		long idle;
715
716		cl->bstats.packets++;
717		cl->bstats.bytes += len;
718
719		/*
720		   (now - last) is total time between packet right edges.
721		   (last_pktlen/rate) is "virtual" busy time, so that
722
723			 idle = (now - last) - last_pktlen/rate
724		 */
725
726		idle = q->now - cl->last;
727		if ((unsigned long)idle > 128*1024*1024) {
728			avgidle = cl->maxidle;
729		} else {
730			idle -= L2T(cl, len);
731
732		/* true_avgidle := (1-W)*true_avgidle + W*idle,
733		   where W=2^{-ewma_log}. But cl->avgidle is scaled:
734		   cl->avgidle == true_avgidle/W,
735		   hence:
736		 */
737			avgidle += idle - (avgidle>>cl->ewma_log);
738		}
739
740		if (avgidle <= 0) {
741			/* Overlimit or at-limit */
742
743			if (avgidle < cl->minidle)
744				avgidle = cl->minidle;
745
746			cl->avgidle = avgidle;
747
			/* Calculate the expected time when this class
			   will be allowed to send again.
			   That happens when:
			   (1-W)*true_avgidle + W*delay = 0, i.e.
			   idle = (1/W - 1)*(-true_avgidle)
			   or, in the scaled units used here,
			   idle = (1 - W)*(-cl->avgidle);
			 */
756			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
757
			/*
			   That is not all.
			   To maintain the rate allocated to the class,
			   we also add to undertime the virtual clock time
			   needed to complete the transmitted packet.
			   (len/phys_bandwidth has already elapsed by the
			   time cbq_update runs.)
			 */
766
767			idle -= L2T(&q->link, len);
768			idle += L2T(cl, len);
769
770			cl->undertime = q->now + idle;
771		} else {
772			/* Underlimit */
773
774			cl->undertime = PSCHED_PASTPERFECT;
775			if (avgidle > cl->maxidle)
776				cl->avgidle = cl->maxidle;
777			else
778				cl->avgidle = avgidle;
779		}
780		cl->last = q->now;
781	}
782
783	cbq_update_toplevel(q, this, q->tx_borrowed);
784}
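
/*
 * Numerical sketch of the avgidle update in cbq_update() (values are
 * made up): with ewma_log = 5 the weight is W = 2^-5, and the scaled
 * update  avgidle += idle - (avgidle >> 5)  is the fixed-point form of
 * true_avgidle = (1-W)*true_avgidle + W*idle.  Starting from
 * avgidle = 3200 (true value 100) with a measured idle of -160:
 *
 *	avgidle = 3200 + (-160) - (3200 >> 5) = 3200 - 160 - 100 = 2940
 *
 * which corresponds to a true average of 2940/32 ~= 92: the average
 * idle time is shrinking towards zero, the point at which the class is
 * considered to be at its allocated rate.
 */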
785
786static __inline__ struct cbq_class *
787cbq_under_limit(struct cbq_class *cl)
788{
789	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
790	struct cbq_class *this_cl = cl;
791
792	if (cl->tparent == NULL)
793		return cl;
794
795	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
796		cl->delayed = 0;
797		return cl;
798	}
799
800	do {
		/* This is a very suspicious place. Currently the overlimit
		   action is generated for non-bounded classes
		   only if the link is completely congested.
		   Though this agrees with the ancestor-only paradigm,
		   it looks unwise. In particular, it means that this
		   chunk of code will either never be called or will
		   strongly amplify burstiness. Dangerous and silly,
		   but no better solution is known.
		 */
811		if ((cl = cl->borrow) == NULL) {
812			this_cl->qstats.overlimits++;
813			this_cl->overlimit(this_cl);
814			return NULL;
815		}
816		if (cl->level > q->toplevel)
817			return NULL;
818	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
819
820	cl->delayed = 0;
821	return cl;
822}
823
824static __inline__ struct sk_buff *
825cbq_dequeue_prio(struct Qdisc *sch, int prio)
826{
827	struct cbq_sched_data *q = qdisc_priv(sch);
828	struct cbq_class *cl_tail, *cl_prev, *cl;
829	struct sk_buff *skb;
830	int deficit;
831
832	cl_tail = cl_prev = q->active[prio];
833	cl = cl_prev->next_alive;
834
835	do {
836		deficit = 0;
837
838		/* Start round */
839		do {
840			struct cbq_class *borrow = cl;
841
842			if (cl->q->q.qlen &&
843			    (borrow = cbq_under_limit(cl)) == NULL)
844				goto skip_class;
845
846			if (cl->deficit <= 0) {
				/* The class has exhausted its allotment
				   for this round. Switch to the next one.
				 */
850				deficit = 1;
851				cl->deficit += cl->quantum;
852				goto next_class;
853			}
854
855			skb = cl->q->dequeue(cl->q);
856
			/* The class did not give us any skb :-(
			   This can happen even if cl->q->q.qlen != 0,
			   e.g. if cl->q is a "tbf" qdisc.
			 */
861			if (skb == NULL)
862				goto skip_class;
863
864			cl->deficit -= skb->len;
865			q->tx_class = cl;
866			q->tx_borrowed = borrow;
867			if (borrow != cl) {
868#ifndef CBQ_XSTATS_BORROWS_BYTES
869				borrow->xstats.borrows++;
870				cl->xstats.borrows++;
871#else
872				borrow->xstats.borrows += skb->len;
873				cl->xstats.borrows += skb->len;
874#endif
875			}
876			q->tx_len = skb->len;
877
878			if (cl->deficit <= 0) {
879				q->active[prio] = cl;
880				cl = cl->next_alive;
881				cl->deficit += cl->quantum;
882			}
883			return skb;
884
885skip_class:
886			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
				/* The class is empty or penalized.
				   Unlink it from the active chain.
				 */
890				cl_prev->next_alive = cl->next_alive;
891				cl->next_alive = NULL;
892
893				/* Did cl_tail point to it? */
894				if (cl == cl_tail) {
895					/* Repair it! */
896					cl_tail = cl_prev;
897
898					/* Was it the last class in this band? */
899					if (cl == cl_tail) {
900						/* Kill the band! */
901						q->active[prio] = NULL;
902						q->activemask &= ~(1<<prio);
903						if (cl->q->q.qlen)
904							cbq_activate_class(cl);
905						return NULL;
906					}
907
908					q->active[prio] = cl_tail;
909				}
910				if (cl->q->q.qlen)
911					cbq_activate_class(cl);
912
913				cl = cl_prev;
914			}
915
916next_class:
917			cl_prev = cl;
918			cl = cl->next_alive;
919		} while (cl_prev != cl_tail);
920	} while (deficit);
921
922	q->active[prio] = cl_prev;
923
924	return NULL;
925}
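
/*
 * Deficit round-robin sketch for cbq_dequeue_prio() (numbers are made
 * up): each backlogged class in the band is visited in turn.  A class
 * whose deficit is <= 0 is skipped for this round and recharged by its
 * quantum (e.g. a deficit of -1200 with quantum 1500 becomes 300), and
 * the "deficit" flag forces another pass over the band.  A class with
 * positive deficit may transmit one packet, whose length is then
 * subtracted from its deficit, so over many rounds each class receives
 * bandwidth roughly in proportion to its quantum, i.e. to its weight.
 */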
926
927static __inline__ struct sk_buff *
928cbq_dequeue_1(struct Qdisc *sch)
929{
930	struct cbq_sched_data *q = qdisc_priv(sch);
931	struct sk_buff *skb;
932	unsigned activemask;
933
934	activemask = q->activemask&0xFF;
935	while (activemask) {
936		int prio = ffz(~activemask);
937		activemask &= ~(1<<prio);
938		skb = cbq_dequeue_prio(sch, prio);
939		if (skb)
940			return skb;
941	}
942	return NULL;
943}
944
945static struct sk_buff *
946cbq_dequeue(struct Qdisc *sch)
947{
948	struct sk_buff *skb;
949	struct cbq_sched_data *q = qdisc_priv(sch);
950	psched_time_t now;
951	psched_tdiff_t incr;
952
953	now = psched_get_time();
954	incr = now - q->now_rt;
955
956	if (q->tx_class) {
957		psched_tdiff_t incr2;
		/* Time integrator. We calculate EOS (end of service) time
		   by adding the expected packet transmission time.
		   If real time is greater, we warp the artificial clock,
		   so that:

		   cbq_time = max(real_time, work);
		 */
965		incr2 = L2T(&q->link, q->tx_len);
966		q->now += incr2;
967		cbq_update(q);
968		if ((incr -= incr2) < 0)
969			incr = 0;
970	}
971	q->now += incr;
972	q->now_rt = now;
973
974	for (;;) {
975		q->wd_expires = 0;
976
977		skb = cbq_dequeue_1(sch);
978		if (skb) {
979			sch->q.qlen--;
980			sch->flags &= ~TCQ_F_THROTTLED;
981			return skb;
982		}
983
		/* All the classes are overlimit.

		   This is possible if:

		   1. The scheduler is empty.
		   2. The toplevel cutoff inhibited borrowing.
		   3. The root class is overlimit.

		   Reset conditions 2 and 3 and retry.

		   Note that NS and cbq-2.0 are buggy here: picking
		   an arbitrary class is appropriate for ancestor-only
		   sharing, but not for the toplevel algorithm.

		   Our version is better but slower, because it requires
		   two passes; that is unavoidable with top-level sharing.
		*/
1001
1002		if (q->toplevel == TC_CBQ_MAXLEVEL &&
1003		    q->link.undertime == PSCHED_PASTPERFECT)
1004			break;
1005
1006		q->toplevel = TC_CBQ_MAXLEVEL;
1007		q->link.undertime = PSCHED_PASTPERFECT;
1008	}
1009
	/* No packets in the scheduler, or nobody wants to give them to us :-(
	   Sigh... start the watchdog timer in the latter case. */
1012
1013	if (sch->q.qlen) {
1014		sch->qstats.overlimits++;
1015		if (q->wd_expires)
1016			qdisc_watchdog_schedule(&q->watchdog,
1017						now + q->wd_expires);
1018	}
1019	return NULL;
1020}
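
/*
 * Sketch of the time integrator at the top of cbq_dequeue() (figures
 * are hypothetical): if only 50 ticks of real time passed since the
 * last dequeue but the previous packet needed 80 ticks at the link
 * rate (incr2 = L2T(&q->link, q->tx_len) = 80), then q->now advances
 * by 80 and the leftover incr is clamped to 0, so the virtual clock
 * runs ahead of real time.  When real time is the larger of the two,
 * q->now advances by the real increment instead, which is what gives
 * cbq_time = max(real_time, work).
 */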
1021
/* CBQ class maintenance routines */
1023
1024static void cbq_adjust_levels(struct cbq_class *this)
1025{
1026	if (this == NULL)
1027		return;
1028
1029	do {
1030		int level = 0;
1031		struct cbq_class *cl;
1032
1033		if ((cl = this->children) != NULL) {
1034			do {
1035				if (cl->level > level)
1036					level = cl->level;
1037			} while ((cl = cl->sibling) != this->children);
1038		}
1039		this->level = level+1;
1040	} while ((this = this->tparent) != NULL);
1041}
1042
1043static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
1044{
1045	struct cbq_class *cl;
1046	unsigned h;
1047
1048	if (q->quanta[prio] == 0)
1049		return;
1050
1051	for (h=0; h<16; h++) {
1052		for (cl = q->classes[h]; cl; cl = cl->next) {
			/* BUGGGG... Beware! This expression can suffer
			   from arithmetic overflow!
			 */
1056			if (cl->priority == prio) {
1057				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
1058					q->quanta[prio];
1059			}
1060			if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) {
1061				printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum);
1062				cl->quantum = cl->qdisc->dev->mtu/2 + 1;
1063			}
1064		}
1065	}
1066}
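
/*
 * Example of the normalization above (illustrative figures): with two
 * classes at the same priority, weights 2 and 1 and allot 1514 each,
 * q->nclasses[prio] == 2 and q->quanta[prio] == 3, so
 *
 *	quantum = 2 * 1514 * 2 / 3 = 2018	(first class)
 *	quantum = 1 * 1514 * 2 / 3 = 1009	(second class)
 *
 * i.e. the mean quantum stays near the allot while the ratio of quanta
 * follows the ratio of weights.  As the comment above warns, the
 * intermediate product weight*allot*nclasses is what can overflow.
 */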
1067
1068static void cbq_sync_defmap(struct cbq_class *cl)
1069{
1070	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1071	struct cbq_class *split = cl->split;
1072	unsigned h;
1073	int i;
1074
1075	if (split == NULL)
1076		return;
1077
1078	for (i=0; i<=TC_PRIO_MAX; i++) {
1079		if (split->defaults[i] == cl && !(cl->defmap&(1<<i)))
1080			split->defaults[i] = NULL;
1081	}
1082
1083	for (i=0; i<=TC_PRIO_MAX; i++) {
1084		int level = split->level;
1085
1086		if (split->defaults[i])
1087			continue;
1088
1089		for (h=0; h<16; h++) {
1090			struct cbq_class *c;
1091
1092			for (c = q->classes[h]; c; c = c->next) {
1093				if (c->split == split && c->level < level &&
1094				    c->defmap&(1<<i)) {
1095					split->defaults[i] = c;
1096					level = c->level;
1097				}
1098			}
1099		}
1100	}
1101}
1102
1103static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
1104{
1105	struct cbq_class *split = NULL;
1106
1107	if (splitid == 0) {
1108		if ((split = cl->split) == NULL)
1109			return;
1110		splitid = split->classid;
1111	}
1112
1113	if (split == NULL || split->classid != splitid) {
1114		for (split = cl->tparent; split; split = split->tparent)
1115			if (split->classid == splitid)
1116				break;
1117	}
1118
1119	if (split == NULL)
1120		return;
1121
1122	if (cl->split != split) {
1123		cl->defmap = 0;
1124		cbq_sync_defmap(cl);
1125		cl->split = split;
1126		cl->defmap = def&mask;
1127	} else
1128		cl->defmap = (cl->defmap&~mask)|(def&mask);
1129
1130	cbq_sync_defmap(cl);
1131}
1132
1133static void cbq_unlink_class(struct cbq_class *this)
1134{
1135	struct cbq_class *cl, **clp;
1136	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1137
1138	for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) {
1139		if (cl == this) {
1140			*clp = cl->next;
1141			cl->next = NULL;
1142			break;
1143		}
1144	}
1145
1146	if (this->tparent) {
1147		clp=&this->sibling;
1148		cl = *clp;
1149		do {
1150			if (cl == this) {
1151				*clp = cl->sibling;
1152				break;
1153			}
1154			clp = &cl->sibling;
1155		} while ((cl = *clp) != this->sibling);
1156
1157		if (this->tparent->children == this) {
1158			this->tparent->children = this->sibling;
1159			if (this->sibling == this)
1160				this->tparent->children = NULL;
1161		}
1162	} else {
1163		BUG_TRAP(this->sibling == this);
1164	}
1165}
1166
1167static void cbq_link_class(struct cbq_class *this)
1168{
1169	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1170	unsigned h = cbq_hash(this->classid);
1171	struct cbq_class *parent = this->tparent;
1172
1173	this->sibling = this;
1174	this->next = q->classes[h];
1175	q->classes[h] = this;
1176
1177	if (parent == NULL)
1178		return;
1179
1180	if (parent->children == NULL) {
1181		parent->children = this;
1182	} else {
1183		this->sibling = parent->children->sibling;
1184		parent->children->sibling = this;
1185	}
1186}
1187
1188static unsigned int cbq_drop(struct Qdisc* sch)
1189{
1190	struct cbq_sched_data *q = qdisc_priv(sch);
1191	struct cbq_class *cl, *cl_head;
1192	int prio;
1193	unsigned int len;
1194
1195	for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
1196		if ((cl_head = q->active[prio]) == NULL)
1197			continue;
1198
1199		cl = cl_head;
1200		do {
1201			if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
1202				sch->q.qlen--;
1203				if (!cl->q->q.qlen)
1204					cbq_deactivate_class(cl);
1205				return len;
1206			}
1207		} while ((cl = cl->next_alive) != cl_head);
1208	}
1209	return 0;
1210}
1211
1212static void
1213cbq_reset(struct Qdisc* sch)
1214{
1215	struct cbq_sched_data *q = qdisc_priv(sch);
1216	struct cbq_class *cl;
1217	int prio;
1218	unsigned h;
1219
1220	q->activemask = 0;
1221	q->pmask = 0;
1222	q->tx_class = NULL;
1223	q->tx_borrowed = NULL;
1224	qdisc_watchdog_cancel(&q->watchdog);
1225	hrtimer_cancel(&q->delay_timer);
1226	q->toplevel = TC_CBQ_MAXLEVEL;
1227	q->now = psched_get_time();
1228	q->now_rt = q->now;
1229
1230	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1231		q->active[prio] = NULL;
1232
1233	for (h = 0; h < 16; h++) {
1234		for (cl = q->classes[h]; cl; cl = cl->next) {
1235			qdisc_reset(cl->q);
1236
1237			cl->next_alive = NULL;
1238			cl->undertime = PSCHED_PASTPERFECT;
1239			cl->avgidle = cl->maxidle;
1240			cl->deficit = cl->quantum;
1241			cl->cpriority = cl->priority;
1242		}
1243	}
1244	sch->q.qlen = 0;
1245}
1246
1247
1248static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1249{
1250	if (lss->change&TCF_CBQ_LSS_FLAGS) {
1251		cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1252		cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1253	}
1254	if (lss->change&TCF_CBQ_LSS_EWMA)
1255		cl->ewma_log = lss->ewma_log;
1256	if (lss->change&TCF_CBQ_LSS_AVPKT)
1257		cl->avpkt = lss->avpkt;
1258	if (lss->change&TCF_CBQ_LSS_MINIDLE)
1259		cl->minidle = -(long)lss->minidle;
1260	if (lss->change&TCF_CBQ_LSS_MAXIDLE) {
1261		cl->maxidle = lss->maxidle;
1262		cl->avgidle = lss->maxidle;
1263	}
1264	if (lss->change&TCF_CBQ_LSS_OFFTIME)
1265		cl->offtime = lss->offtime;
1266	return 0;
1267}
1268
1269static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1270{
1271	q->nclasses[cl->priority]--;
1272	q->quanta[cl->priority] -= cl->weight;
1273	cbq_normalize_quanta(q, cl->priority);
1274}
1275
1276static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1277{
1278	q->nclasses[cl->priority]++;
1279	q->quanta[cl->priority] += cl->weight;
1280	cbq_normalize_quanta(q, cl->priority);
1281}
1282
1283static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1284{
1285	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1286
1287	if (wrr->allot)
1288		cl->allot = wrr->allot;
1289	if (wrr->weight)
1290		cl->weight = wrr->weight;
1291	if (wrr->priority) {
1292		cl->priority = wrr->priority-1;
1293		cl->cpriority = cl->priority;
1294		if (cl->priority >= cl->priority2)
1295			cl->priority2 = TC_CBQ_MAXPRIO-1;
1296	}
1297
1298	cbq_addprio(q, cl);
1299	return 0;
1300}
1301
1302static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl)
1303{
1304	switch (ovl->strategy) {
1305	case TC_CBQ_OVL_CLASSIC:
1306		cl->overlimit = cbq_ovl_classic;
1307		break;
1308	case TC_CBQ_OVL_DELAY:
1309		cl->overlimit = cbq_ovl_delay;
1310		break;
1311	case TC_CBQ_OVL_LOWPRIO:
1312		if (ovl->priority2-1 >= TC_CBQ_MAXPRIO ||
1313		    ovl->priority2-1 <= cl->priority)
1314			return -EINVAL;
1315		cl->priority2 = ovl->priority2-1;
1316		cl->overlimit = cbq_ovl_lowprio;
1317		break;
1318	case TC_CBQ_OVL_DROP:
1319		cl->overlimit = cbq_ovl_drop;
1320		break;
1321	case TC_CBQ_OVL_RCLASSIC:
1322		cl->overlimit = cbq_ovl_rclassic;
1323		break;
1324	default:
1325		return -EINVAL;
1326	}
1327	cl->penalty = ovl->penalty;
1328	return 0;
1329}
1330
1331#ifdef CONFIG_NET_CLS_POLICE
1332static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
1333{
1334	cl->police = p->police;
1335
1336	if (cl->q->handle) {
1337		if (p->police == TC_POLICE_RECLASSIFY)
1338			cl->q->reshape_fail = cbq_reshape_fail;
1339		else
1340			cl->q->reshape_fail = NULL;
1341	}
1342	return 0;
1343}
1344#endif
1345
1346static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1347{
1348	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1349	return 0;
1350}
1351
1352static int cbq_init(struct Qdisc *sch, struct rtattr *opt)
1353{
1354	struct cbq_sched_data *q = qdisc_priv(sch);
1355	struct rtattr *tb[TCA_CBQ_MAX];
1356	struct tc_ratespec *r;
1357
1358	if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 ||
1359	    tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1360	    RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1361		return -EINVAL;
1362
1363	if (tb[TCA_CBQ_LSSOPT-1] &&
1364	    RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1365		return -EINVAL;
1366
1367	r = RTA_DATA(tb[TCA_CBQ_RATE-1]);
1368
1369	if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL)
1370		return -EINVAL;
1371
1372	q->link.refcnt = 1;
1373	q->link.sibling = &q->link;
1374	q->link.classid = sch->handle;
1375	q->link.qdisc = sch;
1376	if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1377					    sch->handle)))
1378		q->link.q = &noop_qdisc;
1379
1380	q->link.priority = TC_CBQ_MAXPRIO-1;
1381	q->link.priority2 = TC_CBQ_MAXPRIO-1;
1382	q->link.cpriority = TC_CBQ_MAXPRIO-1;
1383	q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC;
1384	q->link.overlimit = cbq_ovl_classic;
1385	q->link.allot = psched_mtu(sch->dev);
1386	q->link.quantum = q->link.allot;
1387	q->link.weight = q->link.R_tab->rate.rate;
1388
1389	q->link.ewma_log = TC_CBQ_DEF_EWMA;
1390	q->link.avpkt = q->link.allot/2;
1391	q->link.minidle = -0x7FFFFFFF;
1392	q->link.stats_lock = &sch->dev->queue_lock;
1393
1394	qdisc_watchdog_init(&q->watchdog, sch);
1395	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
1396	q->delay_timer.function = cbq_undelay;
1397	q->toplevel = TC_CBQ_MAXLEVEL;
1398	q->now = psched_get_time();
1399	q->now_rt = q->now;
1400
1401	cbq_link_class(&q->link);
1402
1403	if (tb[TCA_CBQ_LSSOPT-1])
1404		cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1405
1406	cbq_addprio(q, &q->link);
1407	return 0;
1408}
1409
1410static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1411{
1412	unsigned char *b = skb_tail_pointer(skb);
1413
1414	RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate);
1415	return skb->len;
1416
1417rtattr_failure:
1418	nlmsg_trim(skb, b);
1419	return -1;
1420}
1421
1422static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1423{
1424	unsigned char *b = skb_tail_pointer(skb);
1425	struct tc_cbq_lssopt opt;
1426
1427	opt.flags = 0;
1428	if (cl->borrow == NULL)
1429		opt.flags |= TCF_CBQ_LSS_BOUNDED;
1430	if (cl->share == NULL)
1431		opt.flags |= TCF_CBQ_LSS_ISOLATED;
1432	opt.ewma_log = cl->ewma_log;
1433	opt.level = cl->level;
1434	opt.avpkt = cl->avpkt;
1435	opt.maxidle = cl->maxidle;
1436	opt.minidle = (u32)(-cl->minidle);
1437	opt.offtime = cl->offtime;
1438	opt.change = ~0;
1439	RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt);
1440	return skb->len;
1441
1442rtattr_failure:
1443	nlmsg_trim(skb, b);
1444	return -1;
1445}
1446
1447static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1448{
1449	unsigned char *b = skb_tail_pointer(skb);
1450	struct tc_cbq_wrropt opt;
1451
1452	opt.flags = 0;
1453	opt.allot = cl->allot;
1454	opt.priority = cl->priority+1;
1455	opt.cpriority = cl->cpriority+1;
1456	opt.weight = cl->weight;
1457	RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt);
1458	return skb->len;
1459
1460rtattr_failure:
1461	nlmsg_trim(skb, b);
1462	return -1;
1463}
1464
1465static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl)
1466{
1467	unsigned char *b = skb_tail_pointer(skb);
1468	struct tc_cbq_ovl opt;
1469
1470	opt.strategy = cl->ovl_strategy;
1471	opt.priority2 = cl->priority2+1;
1472	opt.pad = 0;
1473	opt.penalty = cl->penalty;
1474	RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt);
1475	return skb->len;
1476
1477rtattr_failure:
1478	nlmsg_trim(skb, b);
1479	return -1;
1480}
1481
1482static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1483{
1484	unsigned char *b = skb_tail_pointer(skb);
1485	struct tc_cbq_fopt opt;
1486
1487	if (cl->split || cl->defmap) {
1488		opt.split = cl->split ? cl->split->classid : 0;
1489		opt.defmap = cl->defmap;
1490		opt.defchange = ~0;
1491		RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt);
1492	}
1493	return skb->len;
1494
1495rtattr_failure:
1496	nlmsg_trim(skb, b);
1497	return -1;
1498}
1499
1500#ifdef CONFIG_NET_CLS_POLICE
1501static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
1502{
1503	unsigned char *b = skb_tail_pointer(skb);
1504	struct tc_cbq_police opt;
1505
1506	if (cl->police) {
1507		opt.police = cl->police;
1508		opt.__res1 = 0;
1509		opt.__res2 = 0;
1510		RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt);
1511	}
1512	return skb->len;
1513
1514rtattr_failure:
1515	nlmsg_trim(skb, b);
1516	return -1;
1517}
1518#endif
1519
1520static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1521{
1522	if (cbq_dump_lss(skb, cl) < 0 ||
1523	    cbq_dump_rate(skb, cl) < 0 ||
1524	    cbq_dump_wrr(skb, cl) < 0 ||
1525	    cbq_dump_ovl(skb, cl) < 0 ||
1526#ifdef CONFIG_NET_CLS_POLICE
1527	    cbq_dump_police(skb, cl) < 0 ||
1528#endif
1529	    cbq_dump_fopt(skb, cl) < 0)
1530		return -1;
1531	return 0;
1532}
1533
1534static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1535{
1536	struct cbq_sched_data *q = qdisc_priv(sch);
1537	unsigned char *b = skb_tail_pointer(skb);
1538	struct rtattr *rta;
1539
1540	rta = (struct rtattr*)b;
1541	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1542	if (cbq_dump_attr(skb, &q->link) < 0)
1543		goto rtattr_failure;
1544	rta->rta_len = skb_tail_pointer(skb) - b;
1545	return skb->len;
1546
1547rtattr_failure:
1548	nlmsg_trim(skb, b);
1549	return -1;
1550}
1551
1552static int
1553cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1554{
1555	struct cbq_sched_data *q = qdisc_priv(sch);
1556
1557	q->link.xstats.avgidle = q->link.avgidle;
1558	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1559}
1560
1561static int
1562cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1563	       struct sk_buff *skb, struct tcmsg *tcm)
1564{
1565	struct cbq_class *cl = (struct cbq_class*)arg;
1566	unsigned char *b = skb_tail_pointer(skb);
1567	struct rtattr *rta;
1568
1569	if (cl->tparent)
1570		tcm->tcm_parent = cl->tparent->classid;
1571	else
1572		tcm->tcm_parent = TC_H_ROOT;
1573	tcm->tcm_handle = cl->classid;
1574	tcm->tcm_info = cl->q->handle;
1575
1576	rta = (struct rtattr*)b;
1577	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
1578	if (cbq_dump_attr(skb, cl) < 0)
1579		goto rtattr_failure;
1580	rta->rta_len = skb_tail_pointer(skb) - b;
1581	return skb->len;
1582
1583rtattr_failure:
1584	nlmsg_trim(skb, b);
1585	return -1;
1586}
1587
1588static int
1589cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1590	struct gnet_dump *d)
1591{
1592	struct cbq_sched_data *q = qdisc_priv(sch);
1593	struct cbq_class *cl = (struct cbq_class*)arg;
1594
1595	cl->qstats.qlen = cl->q->q.qlen;
1596	cl->xstats.avgidle = cl->avgidle;
1597	cl->xstats.undertime = 0;
1598
1599	if (cl->undertime != PSCHED_PASTPERFECT)
1600		cl->xstats.undertime = cl->undertime - q->now;
1601
1602	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1603#ifdef CONFIG_NET_ESTIMATOR
1604	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1605#endif
1606	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
1607		return -1;
1608
1609	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1610}
1611
1612static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1613		     struct Qdisc **old)
1614{
1615	struct cbq_class *cl = (struct cbq_class*)arg;
1616
1617	if (cl) {
1618		if (new == NULL) {
1619			if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
1620						     cl->classid)) == NULL)
1621				return -ENOBUFS;
1622		} else {
1623#ifdef CONFIG_NET_CLS_POLICE
1624			if (cl->police == TC_POLICE_RECLASSIFY)
1625				new->reshape_fail = cbq_reshape_fail;
1626#endif
1627		}
1628		sch_tree_lock(sch);
1629		*old = xchg(&cl->q, new);
1630		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1631		qdisc_reset(*old);
1632		sch_tree_unlock(sch);
1633
1634		return 0;
1635	}
1636	return -ENOENT;
1637}
1638
1639static struct Qdisc *
1640cbq_leaf(struct Qdisc *sch, unsigned long arg)
1641{
1642	struct cbq_class *cl = (struct cbq_class*)arg;
1643
1644	return cl ? cl->q : NULL;
1645}
1646
1647static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1648{
1649	struct cbq_class *cl = (struct cbq_class *)arg;
1650
1651	if (cl->q->q.qlen == 0)
1652		cbq_deactivate_class(cl);
1653}
1654
1655static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
1656{
1657	struct cbq_sched_data *q = qdisc_priv(sch);
1658	struct cbq_class *cl = cbq_class_lookup(q, classid);
1659
1660	if (cl) {
1661		cl->refcnt++;
1662		return (unsigned long)cl;
1663	}
1664	return 0;
1665}
1666
1667static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1668{
1669	struct cbq_sched_data *q = qdisc_priv(sch);
1670
1671	BUG_TRAP(!cl->filters);
1672
1673	tcf_destroy_chain(cl->filter_list);
1674	qdisc_destroy(cl->q);
1675	qdisc_put_rtab(cl->R_tab);
1676#ifdef CONFIG_NET_ESTIMATOR
1677	gen_kill_estimator(&cl->bstats, &cl->rate_est);
1678#endif
1679	if (cl != &q->link)
1680		kfree(cl);
1681}
1682
1683static void
1684cbq_destroy(struct Qdisc* sch)
1685{
1686	struct cbq_sched_data *q = qdisc_priv(sch);
1687	struct cbq_class *cl;
1688	unsigned h;
1689
1690#ifdef CONFIG_NET_CLS_POLICE
1691	q->rx_class = NULL;
1692#endif
	/*
	 * Filters must be destroyed first because we don't destroy the
	 * classes from root to leaves, which means that filters can still
	 * be bound to classes which have been destroyed already. --TGR '04
	 */
1698	for (h = 0; h < 16; h++) {
1699		for (cl = q->classes[h]; cl; cl = cl->next) {
1700			tcf_destroy_chain(cl->filter_list);
1701			cl->filter_list = NULL;
1702		}
1703	}
1704	for (h = 0; h < 16; h++) {
1705		struct cbq_class *next;
1706
1707		for (cl = q->classes[h]; cl; cl = next) {
1708			next = cl->next;
1709			cbq_destroy_class(sch, cl);
1710		}
1711	}
1712}
1713
1714static void cbq_put(struct Qdisc *sch, unsigned long arg)
1715{
1716	struct cbq_class *cl = (struct cbq_class*)arg;
1717
1718	if (--cl->refcnt == 0) {
1719#ifdef CONFIG_NET_CLS_POLICE
1720		struct cbq_sched_data *q = qdisc_priv(sch);
1721
1722		spin_lock_bh(&sch->dev->queue_lock);
1723		if (q->rx_class == cl)
1724			q->rx_class = NULL;
1725		spin_unlock_bh(&sch->dev->queue_lock);
1726#endif
1727
1728		cbq_destroy_class(sch, cl);
1729	}
1730}
1731
1732static int
1733cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca,
1734		 unsigned long *arg)
1735{
1736	int err;
1737	struct cbq_sched_data *q = qdisc_priv(sch);
1738	struct cbq_class *cl = (struct cbq_class*)*arg;
1739	struct rtattr *opt = tca[TCA_OPTIONS-1];
1740	struct rtattr *tb[TCA_CBQ_MAX];
1741	struct cbq_class *parent;
1742	struct qdisc_rate_table *rtab = NULL;
1743
1744	if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt))
1745		return -EINVAL;
1746
1747	if (tb[TCA_CBQ_OVL_STRATEGY-1] &&
1748	    RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl))
1749		return -EINVAL;
1750
1751	if (tb[TCA_CBQ_FOPT-1] &&
1752	    RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt))
1753		return -EINVAL;
1754
1755	if (tb[TCA_CBQ_RATE-1] &&
1756	    RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec))
1757			return -EINVAL;
1758
1759	if (tb[TCA_CBQ_LSSOPT-1] &&
1760	    RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt))
1761			return -EINVAL;
1762
1763	if (tb[TCA_CBQ_WRROPT-1] &&
1764	    RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt))
1765			return -EINVAL;
1766
1767#ifdef CONFIG_NET_CLS_POLICE
1768	if (tb[TCA_CBQ_POLICE-1] &&
1769	    RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police))
1770			return -EINVAL;
1771#endif
1772
1773	if (cl) {
1774		/* Check parent */
1775		if (parentid) {
1776			if (cl->tparent && cl->tparent->classid != parentid)
1777				return -EINVAL;
1778			if (!cl->tparent && parentid != TC_H_ROOT)
1779				return -EINVAL;
1780		}
1781
1782		if (tb[TCA_CBQ_RATE-1]) {
1783			rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1784			if (rtab == NULL)
1785				return -EINVAL;
1786		}
1787
1788		/* Change class parameters */
1789		sch_tree_lock(sch);
1790
1791		if (cl->next_alive != NULL)
1792			cbq_deactivate_class(cl);
1793
1794		if (rtab) {
1795			rtab = xchg(&cl->R_tab, rtab);
1796			qdisc_put_rtab(rtab);
1797		}
1798
1799		if (tb[TCA_CBQ_LSSOPT-1])
1800			cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1801
1802		if (tb[TCA_CBQ_WRROPT-1]) {
1803			cbq_rmprio(q, cl);
1804			cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1805		}
1806
1807		if (tb[TCA_CBQ_OVL_STRATEGY-1])
1808			cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1809
1810#ifdef CONFIG_NET_CLS_POLICE
1811		if (tb[TCA_CBQ_POLICE-1])
1812			cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1813#endif
1814
1815		if (tb[TCA_CBQ_FOPT-1])
1816			cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1817
1818		if (cl->q->q.qlen)
1819			cbq_activate_class(cl);
1820
1821		sch_tree_unlock(sch);
1822
1823#ifdef CONFIG_NET_ESTIMATOR
1824		if (tca[TCA_RATE-1])
1825			gen_replace_estimator(&cl->bstats, &cl->rate_est,
1826				cl->stats_lock, tca[TCA_RATE-1]);
1827#endif
1828		return 0;
1829	}
1830
1831	if (parentid == TC_H_ROOT)
1832		return -EINVAL;
1833
1834	if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL ||
1835	    tb[TCA_CBQ_LSSOPT-1] == NULL)
1836		return -EINVAL;
1837
1838	rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]);
1839	if (rtab == NULL)
1840		return -EINVAL;
1841
1842	if (classid) {
1843		err = -EINVAL;
1844		if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid))
1845			goto failure;
1846	} else {
1847		int i;
1848		classid = TC_H_MAKE(sch->handle,0x8000);
1849
1850		for (i=0; i<0x8000; i++) {
1851			if (++q->hgenerator >= 0x8000)
1852				q->hgenerator = 1;
1853			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1854				break;
1855		}
1856		err = -ENOSR;
1857		if (i >= 0x8000)
1858			goto failure;
1859		classid = classid|q->hgenerator;
1860	}
1861
1862	parent = &q->link;
1863	if (parentid) {
1864		parent = cbq_class_lookup(q, parentid);
1865		err = -EINVAL;
1866		if (parent == NULL)
1867			goto failure;
1868	}
1869
1870	err = -ENOBUFS;
1871	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1872	if (cl == NULL)
1873		goto failure;
1874	cl->R_tab = rtab;
1875	rtab = NULL;
1876	cl->refcnt = 1;
1877	if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid)))
1878		cl->q = &noop_qdisc;
1879	cl->classid = classid;
1880	cl->tparent = parent;
1881	cl->qdisc = sch;
1882	cl->allot = parent->allot;
1883	cl->quantum = cl->allot;
1884	cl->weight = cl->R_tab->rate.rate;
1885	cl->stats_lock = &sch->dev->queue_lock;
1886
1887	sch_tree_lock(sch);
1888	cbq_link_class(cl);
1889	cl->borrow = cl->tparent;
1890	if (cl->tparent != &q->link)
1891		cl->share = cl->tparent;
1892	cbq_adjust_levels(parent);
1893	cl->minidle = -0x7FFFFFFF;
1894	cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1]));
1895	cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1]));
1896	if (cl->ewma_log==0)
1897		cl->ewma_log = q->link.ewma_log;
1898	if (cl->maxidle==0)
1899		cl->maxidle = q->link.maxidle;
1900	if (cl->avpkt==0)
1901		cl->avpkt = q->link.avpkt;
1902	cl->overlimit = cbq_ovl_classic;
1903	if (tb[TCA_CBQ_OVL_STRATEGY-1])
1904		cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1]));
1905#ifdef CONFIG_NET_CLS_POLICE
1906	if (tb[TCA_CBQ_POLICE-1])
1907		cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1]));
1908#endif
1909	if (tb[TCA_CBQ_FOPT-1])
1910		cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1]));
1911	sch_tree_unlock(sch);
1912
1913#ifdef CONFIG_NET_ESTIMATOR
1914	if (tca[TCA_RATE-1])
1915		gen_new_estimator(&cl->bstats, &cl->rate_est,
1916			cl->stats_lock, tca[TCA_RATE-1]);
1917#endif
1918
1919	*arg = (unsigned long)cl;
1920	return 0;
1921
1922failure:
1923	qdisc_put_rtab(rtab);
1924	return err;
1925}
1926
1927static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1928{
1929	struct cbq_sched_data *q = qdisc_priv(sch);
1930	struct cbq_class *cl = (struct cbq_class*)arg;
1931	unsigned int qlen;
1932
1933	if (cl->filters || cl->children || cl == &q->link)
1934		return -EBUSY;
1935
1936	sch_tree_lock(sch);
1937
1938	qlen = cl->q->q.qlen;
1939	qdisc_reset(cl->q);
1940	qdisc_tree_decrease_qlen(cl->q, qlen);
1941
1942	if (cl->next_alive)
1943		cbq_deactivate_class(cl);
1944
1945	if (q->tx_borrowed == cl)
1946		q->tx_borrowed = q->tx_class;
1947	if (q->tx_class == cl) {
1948		q->tx_class = NULL;
1949		q->tx_borrowed = NULL;
1950	}
1951#ifdef CONFIG_NET_CLS_POLICE
1952	if (q->rx_class == cl)
1953		q->rx_class = NULL;
1954#endif
1955
1956	cbq_unlink_class(cl);
1957	cbq_adjust_levels(cl->tparent);
1958	cl->defmap = 0;
1959	cbq_sync_defmap(cl);
1960
1961	cbq_rmprio(q, cl);
1962	sch_tree_unlock(sch);
1963
1964	if (--cl->refcnt == 0)
1965		cbq_destroy_class(sch, cl);
1966
1967	return 0;
1968}
1969
1970static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg)
1971{
1972	struct cbq_sched_data *q = qdisc_priv(sch);
1973	struct cbq_class *cl = (struct cbq_class *)arg;
1974
1975	if (cl == NULL)
1976		cl = &q->link;
1977
1978	return &cl->filter_list;
1979}
1980
1981static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1982				     u32 classid)
1983{
1984	struct cbq_sched_data *q = qdisc_priv(sch);
1985	struct cbq_class *p = (struct cbq_class*)parent;
1986	struct cbq_class *cl = cbq_class_lookup(q, classid);
1987
1988	if (cl) {
1989		if (p && p->level <= cl->level)
1990			return 0;
1991		cl->filters++;
1992		return (unsigned long)cl;
1993	}
1994	return 0;
1995}
1996
1997static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1998{
1999	struct cbq_class *cl = (struct cbq_class*)arg;
2000
2001	cl->filters--;
2002}
2003
2004static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
2005{
2006	struct cbq_sched_data *q = qdisc_priv(sch);
2007	unsigned h;
2008
2009	if (arg->stop)
2010		return;
2011
2012	for (h = 0; h < 16; h++) {
2013		struct cbq_class *cl;
2014
2015		for (cl = q->classes[h]; cl; cl = cl->next) {
2016			if (arg->count < arg->skip) {
2017				arg->count++;
2018				continue;
2019			}
2020			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
2021				arg->stop = 1;
2022				return;
2023			}
2024			arg->count++;
2025		}
2026	}
2027}
2028
2029static struct Qdisc_class_ops cbq_class_ops = {
2030	.graft		=	cbq_graft,
2031	.leaf		=	cbq_leaf,
2032	.qlen_notify	=	cbq_qlen_notify,
2033	.get		=	cbq_get,
2034	.put		=	cbq_put,
2035	.change		=	cbq_change_class,
2036	.delete		=	cbq_delete,
2037	.walk		=	cbq_walk,
2038	.tcf_chain	=	cbq_find_tcf,
2039	.bind_tcf	=	cbq_bind_filter,
2040	.unbind_tcf	=	cbq_unbind_filter,
2041	.dump		=	cbq_dump_class,
2042	.dump_stats	=	cbq_dump_class_stats,
2043};
2044
2045static struct Qdisc_ops cbq_qdisc_ops = {
2046	.next		=	NULL,
2047	.cl_ops		=	&cbq_class_ops,
2048	.id		=	"cbq",
2049	.priv_size	=	sizeof(struct cbq_sched_data),
2050	.enqueue	=	cbq_enqueue,
2051	.dequeue	=	cbq_dequeue,
2052	.requeue	=	cbq_requeue,
2053	.drop		=	cbq_drop,
2054	.init		=	cbq_init,
2055	.reset		=	cbq_reset,
2056	.destroy	=	cbq_destroy,
2057	.change		=	NULL,
2058	.dump		=	cbq_dump,
2059	.dump_stats	=	cbq_dump_stats,
2060	.owner		=	THIS_MODULE,
2061};
2062
2063static int __init cbq_module_init(void)
2064{
2065	return register_qdisc(&cbq_qdisc_ops);
2066}
2067static void __exit cbq_module_exit(void)
2068{
2069	unregister_qdisc(&cbq_qdisc_ops);
2070}
2071module_init(cbq_module_init)
2072module_exit(cbq_module_exit)
2073MODULE_LICENSE("GPL");
2074