/*
 * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Martin Devera, <devik@cdi.cz>
 *
 * Credits (in time order) for older HTB versions:
 *              Stef Coene <stef.coene@docum.org>
 *			HTB support at LARTC mailing list
 *		Ondrej Kraus, <krauso@barr.cz>
 *			found missing INIT_QDISC(htb)
 *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
 *			helped a lot to locate nasty class stall bug
 *		Andi Kleen, Jamal Hadi, Bert Hubert
 *			code review and helpful comments on shaping
 *		Tomasz Wrona, <tw@eter.tym.pl>
 *			created test case so that I was able to fix nasty bug
 *		Wilfried Weissmann
 *			spotted bug in dequeue code and helped with fix
 *		Jiri Fojtasek
 *			fixed requeue routine
 *		and many others. thanks.
 *
 * $Id: sch_htb.c,v 1.1.1.1 2007/08/03 18:53:55 Exp $
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/notifier.h>
#include <net/ip.h>
#include <net/route.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/compiler.h>
#include <net/netlink.h>
#include <net/sock.h>
#include <net/pkt_sched.h>
#include <linux/rbtree.h>

/* HTB algorithm.
    Author: devik@cdi.cz
    ========================================================================
    HTB is like TBF with multiple classes. It is also similar to CBQ because
    it allows a priority to be assigned to each class in the hierarchy.
    In fact it is another implementation of Floyd's formal sharing.

    Levels:
    Each class is assigned a level. A leaf ALWAYS has level 0 and root
    classes have level TC_HTB_MAXDEPTH-1. Interior nodes have a level
    one less than their parent.
*/
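
/* Illustrative only (not from the original source): a minimal tc
   configuration that builds the kind of hierarchy described above.
   Device name and rates are arbitrary example values.

	tc qdisc add dev eth0 root handle 1: htb default 20
	tc class add dev eth0 parent 1:  classid 1:1  htb rate 1mbit ceil 1mbit
	tc class add dev eth0 parent 1:1 classid 1:10 htb rate 600kbit ceil 1mbit
	tc class add dev eth0 parent 1:1 classid 1:20 htb rate 400kbit ceil 1mbit

   Here 1:10 and 1:20 are leaves (level 0), 1:1 is an interior node one
   level above them, and unclassified traffic falls back to the
   "default 20" leaf. */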

#define HTB_HSIZE 16		/* classid hash size */
#define HTB_EWMAC 2		/* rate average over HTB_EWMAC*HTB_HSIZE sec */
#define HTB_RATECM 1		/* whether to use rate computer */
#define HTB_HYSTERESIS 1	/* whether to use mode hysteresis for speedup */
#define HTB_VER 0x30011		/* major must be matched with number supplied by TC as version */

#if HTB_VER >> 16 != TC_HTB_PROTOVER
#error "Mismatched sch_htb.c and pkt_sch.h"
#endif

/* used internally to keep status of a single class */
enum htb_cmode {
	HTB_CANT_SEND,		/* class can't send and can't borrow */
	HTB_MAY_BORROW,		/* class can't send but may borrow */
	HTB_CAN_SEND		/* class can send */
};

/* interior & leaf nodes; props specific to leaves are marked L: */
struct htb_class {
	/* general class parameters */
	u32 classid;
	struct gnet_stats_basic bstats;
	struct gnet_stats_queue qstats;
	struct gnet_stats_rate_est rate_est;
	struct tc_htb_xstats xstats;	/* our special stats */
	int refcnt;		/* usage count of this class */

#ifdef HTB_RATECM
	/* rate measurement counters */
	unsigned long rate_bytes, sum_bytes;
	unsigned long rate_packets, sum_packets;
#endif

	/* topology */
	int level;		/* our level (see above) */
	struct htb_class *parent;	/* parent class */
	struct hlist_node hlist;	/* classid hash list item */
	struct list_head sibling;	/* sibling list item */
	struct list_head children;	/* children list */

	union {
		struct htb_class_leaf {
			struct Qdisc *q;
			int prio;
			int aprio;
			int quantum;
			int deficit[TC_HTB_MAXDEPTH];
			struct list_head drop_list;
		} leaf;
		struct htb_class_inner {
			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
			/* When a class changes from state 1->2 and disconnects
			   from its parent's feed, we lose the ptr value and
			   would start from the first child again. Here we
			   store the classid of the last valid ptr (used when
			   ptr is NULL). */
			u32 last_ptr_id[TC_HTB_NUMPRIO];
		} inner;
	} un;
	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
	struct rb_node pq_node;	/* node for event queue */
	psched_time_t pq_key;

	int prio_activity;	/* for which prios are we active */
	enum htb_cmode cmode;	/* current mode of the class */

	/* class attached filters */
	struct tcf_proto *filter_list;
	int filter_cnt;

	int warned;		/* only one warning about non-work-conserving class */

	/* token bucket parameters */
	struct qdisc_rate_table *rate;	/* rate table of the class itself */
	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
	long buffer, cbuffer;	/* token bucket depth/rate */
	psched_tdiff_t mbuffer;	/* max wait time */
	long tokens, ctokens;	/* current number of tokens */
	psched_time_t t_c;	/* checkpoint time */

	int prio;		/* backup of un.leaf.prio, making parent-to-leaf
				   return possible */
	int quantum;		/* backup of un.leaf.quantum; eventually a full
				   replacement of the un.leaf originals should
				   be done */
};

/* TODO: maybe compute rate when size is too large .. or drop ? */
static inline long L2T(struct htb_class *cl, struct qdisc_rate_table *rate,
			   int size)
{
	int slot = size >> rate->rate.cell_log;
	if (slot > 255) {
		cl->xstats.giants++;
		slot = 255;
	}
	return rate->data[slot];
}
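
/* Worked example of the lookup above (illustrative numbers, not from the
   source): with rate->rate.cell_log = 3 each slot covers 8 bytes, so a
   100-byte packet maps to slot 100 >> 3 = 12 and its transmission time is
   read from rate->data[12].  Anything beyond 255 cells is clamped to the
   last slot and counted as a "giant" in the class xstats. */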

struct htb_sched {
	struct list_head root;	/* root classes list */
	struct hlist_head hash[HTB_HSIZE];	/* hashed by classid */
	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */

	/* self list - roots of self generating tree */
	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
	int row_mask[TC_HTB_MAXDEPTH];
	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];

	/* self wait list - roots of wait PQs per row */
	struct rb_root wait_pq[TC_HTB_MAXDEPTH];

	/* time of nearest event per level (row) */
	psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];

	int nwc_hit;		/* whether we hit a non-work-conserving class
				   during this dequeue; used to disable the
				   mindelay complaint in dequeue */

	int defcls;		/* class where unclassified flows go to */

	/* filters for qdisc itself */
	struct tcf_proto *filter_list;
	int filter_cnt;

	int rate2quantum;	/* quant = rate / rate2quantum */
	psched_time_t now;	/* cached dequeue time */
	struct qdisc_watchdog watchdog;
#ifdef HTB_RATECM
	struct timer_list rttim;	/* rate computer timer */
	int recmp_bucket;	/* which hash bucket to recompute next */
#endif

	/* non-shaped skbs; let them go directly through */
	struct sk_buff_head direct_queue;
	int direct_qlen;	/* max qlen of above */

	long direct_pkts;
};

/* compute hash of size HTB_HSIZE for given handle */
static inline int htb_hash(u32 h)
{
#if HTB_HSIZE != 16
#error "Declare new hash for your HTB_HSIZE"
#endif
	h ^= h >> 8;		/* stolen from cbq_hash */
	h ^= h >> 4;
	return h & 0xf;
}
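
/* Illustrative example (not from the source): classid 1:10 is the 32-bit
   handle 0x00010010.  Folding gives 0x00010010 ^ 0x00000100 = 0x00010110,
   then 0x00010110 ^ 0x00001011 = 0x00011101, and masking with 0xf selects
   hash bucket 1. */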

/* find class in global hash table using given handle */
static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct hlist_node *p;
	struct htb_class *cl;

	if (TC_H_MAJ(handle) != sch->handle)
		return NULL;

	hlist_for_each_entry(cl, p, q->hash + htb_hash(handle), hlist) {
		if (cl->classid == handle)
			return cl;
	}
	return NULL;
}

/**
 * htb_classify - classify a packet into class
 *
 * It returns NULL if the packet should be dropped or -1 if the packet
 * should be passed directly through. In all other cases a leaf class is
 * returned. We allow direct class selection by classid in priority. Then
 * we examine filters in the qdisc and in inner nodes (if a higher filter
 * points to the inner node). If we end up with classid MAJOR:0 we enqueue
 * the skb into the special internal fifo (direct). These packets then go
 * directly through. If we still have no valid leaf we try to use the
 * MAJOR:default leaf. If that is still unsuccessful, we finish and return
 * the direct queue.
 */
#define HTB_DIRECT (struct htb_class*)-1
static inline u32 htb_classid(struct htb_class *cl)
{
	return (cl && cl != HTB_DIRECT) ? cl->classid : TC_H_UNSPEC;
}

static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
				      int *qerr)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl;
	struct tcf_result res;
	struct tcf_proto *tcf;
	int result;

	/* allow selecting a class by setting skb->priority to a valid
	   classid; note that nfmark can be used too by attaching an fw
	   filter with no rules in it */
	if (skb->priority == sch->handle)
		return HTB_DIRECT;	/* X:0 (direct flow) selected */
	if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0)
		return cl;

	*qerr = NET_XMIT_BYPASS;
	tcf = q->filter_list;
	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
			*qerr = NET_XMIT_SUCCESS;
			/* fall through */
		case TC_ACT_SHOT:
			return NULL;
		}
#elif defined(CONFIG_NET_CLS_POLICE)
		if (result == TC_POLICE_SHOT)
			return HTB_DIRECT;
#endif
		if ((cl = (void *)res.class) == NULL) {
			if (res.classid == sch->handle)
				return HTB_DIRECT;	/* X:0 (direct flow) */
			if ((cl = htb_find(res.classid, sch)) == NULL)
				break;	/* filter selected invalid classid */
		}
		if (!cl->level)
			return cl;	/* we hit leaf; return it */

		/* we have got an inner class; apply the inner filter chain */
		tcf = cl->filter_list;
	}
	/* classification failed; try to use default class */
	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
	if (!cl || cl->level)
		return HTB_DIRECT;	/* bad default; this is a safe bet */
	return cl;
}
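
/* Illustrative example of the lookup order above (the command is an
   example, not from the source): with
	tc filter add dev eth0 parent 1: protocol ip u32 \
		match ip dport 80 0xffff flowid 1:10
   a packet to port 80 is handed to leaf 1:10 by the filter chain.  A
   packet whose skb->priority was set to the handle of 1:10 (e.g. via
   SO_PRIORITY) skips the filters entirely, and anything unmatched goes
   to the MAJOR:default class or, failing that, to the direct queue. */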

/**
 * htb_add_to_id_tree - adds class to the round robin list
 *
 * Routine adds the class to the list (actually a tree) sorted by classid.
 * Make sure that the class is not already on such a list for the given
 * prio.
 */
static void htb_add_to_id_tree(struct rb_root *root,
			       struct htb_class *cl, int prio)
{
	struct rb_node **p = &root->rb_node, *parent = NULL;

	while (*p) {
		struct htb_class *c;
		parent = *p;
		c = rb_entry(parent, struct htb_class, node[prio]);

		if (cl->classid > c->classid)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&cl->node[prio], parent, p);
	rb_insert_color(&cl->node[prio], root);
}

/**
 * htb_add_to_wait_tree - adds class to the event queue with delay
 *
 * The class is added to the priority event queue to indicate that the
 * class will change its mode at time cl->pq_key (q->now + delay). Make
 * sure that the class is not already in the queue.
 */
static void htb_add_to_wait_tree(struct htb_sched *q,
				 struct htb_class *cl, long delay)
{
	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;

	cl->pq_key = q->now + delay;
	if (cl->pq_key == q->now)
		cl->pq_key++;

	/* update the nearest event cache */
	if (q->near_ev_cache[cl->level] > cl->pq_key)
		q->near_ev_cache[cl->level] = cl->pq_key;

	while (*p) {
		struct htb_class *c;
		parent = *p;
		c = rb_entry(parent, struct htb_class, pq_node);
		if (cl->pq_key >= c->pq_key)
			p = &parent->rb_right;
		else
			p = &parent->rb_left;
	}
	rb_link_node(&cl->pq_node, parent, p);
	rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
}

/**
 * htb_next_rb_node - finds next node in binary tree
 *
 * Advances *n to the next node; when we are past the last key, *n
 * becomes NULL. Average complexity is 2 steps per call.
 */
static inline void htb_next_rb_node(struct rb_node **n)
{
	*n = rb_next(*n);
}

/**
 * htb_add_class_to_row - add class to its row
 *
 * The class is added to row at priorities marked in mask.
 * It does nothing if mask == 0.
 */
static inline void htb_add_class_to_row(struct htb_sched *q,
					struct htb_class *cl, int mask)
{
	q->row_mask[cl->level] |= mask;
	while (mask) {
		int prio = ffz(~mask);
		mask &= ~(1 << prio);
		htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
	}
}

/* If this triggers, it is a bug in this code, but it need not be fatal */
static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
{
	if (RB_EMPTY_NODE(rb)) {
		WARN_ON(1);
	} else {
		rb_erase(rb, root);
		RB_CLEAR_NODE(rb);
	}
}

/**
 * htb_remove_class_from_row - removes class from its row
 *
 * The class is removed from row at priorities marked in mask.
 * It does nothing if mask == 0.
 */
static inline void htb_remove_class_from_row(struct htb_sched *q,
						 struct htb_class *cl, int mask)
{
	int m = 0;

	while (mask) {
		int prio = ffz(~mask);

		mask &= ~(1 << prio);
		if (q->ptr[cl->level][prio] == cl->node + prio)
			htb_next_rb_node(q->ptr[cl->level] + prio);

		htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
		if (!q->row[cl->level][prio].rb_node)
			m |= 1 << prio;
	}
	q->row_mask[cl->level] &= ~m;
}

/**
 * htb_activate_prios - creates active class's feed chain
 *
 * The class is connected to ancestors and/or appropriate rows
 * for priorities it is participating in. cl->cmode must be the new
 * (activated) mode. It does nothing if cl->prio_activity == 0.
 */
static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
{
	struct htb_class *p = cl->parent;
	long m, mask = cl->prio_activity;

	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
		m = mask;
		while (m) {
			int prio = ffz(~m);
			m &= ~(1 << prio);

			if (p->un.inner.feed[prio].rb_node)
				/* parent already has its feed in use, so
				   reset the bit in mask as the parent is
				   already ok */
				mask &= ~(1 << prio);

			htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
		}
		p->prio_activity |= mask;
		cl = p;
		p = cl->parent;
	}
	if (cl->cmode == HTB_CAN_SEND && mask)
		htb_add_class_to_row(q, cl, mask);
}

/**
 * htb_deactivate_prios - remove class from feed chain
 *
 * cl->cmode must represent the old mode (before deactivation). It does
 * nothing if cl->prio_activity == 0. The class is removed from all feed
 * chains and rows.
 */
static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
{
	struct htb_class *p = cl->parent;
	long m, mask = cl->prio_activity;

	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
		m = mask;
		mask = 0;
		while (m) {
			int prio = ffz(~m);
			m &= ~(1 << prio);

			if (p->un.inner.ptr[prio] == cl->node + prio) {
				/* we are removing a child which is pointed
				   to from the parent feed - forget the
				   pointer but remember the classid */
				p->un.inner.last_ptr_id[prio] = cl->classid;
				p->un.inner.ptr[prio] = NULL;
			}

			htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);

			if (!p->un.inner.feed[prio].rb_node)
				mask |= 1 << prio;
		}

		p->prio_activity &= ~mask;
		cl = p;
		p = cl->parent;
	}
	if (cl->cmode == HTB_CAN_SEND && mask)
		htb_remove_class_from_row(q, cl, mask);
}

#if HTB_HYSTERESIS
static inline long htb_lowater(const struct htb_class *cl)
{
	return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
}
static inline long htb_hiwater(const struct htb_class *cl)
{
	return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
}
#else
#define htb_lowater(cl)	(0)
#define htb_hiwater(cl)	(0)
#endif
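
/* How the water marks above create hysteresis (illustrative reading,
   numbers are examples): a class currently in HTB_CAN_SEND keeps that
   mode until its tokens drop below -cl->buffer, while a class that
   already lost it must climb back to 0 to regain it; likewise a sending
   class only enters HTB_CANT_SEND once its ctokens fall below
   -cl->cbuffer.  With buffer = 8000 ticks the class flips mode once per
   8000-tick swing instead of on every crossing of zero, which is where
   the ~1/6 speed gain mentioned below comes from. */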

/**
 * htb_class_mode - computes and returns current class mode
 *
 * It computes cl's mode at time cl->t_c+diff and returns it. If the mode
 * is not HTB_CAN_SEND then cl->pq_key is updated to the time difference
 * from now to the time when cl will change its state.
 * Also it is worth noting that the class mode doesn't change simply at
 * cl->{c,}tokens == 0; rather there can be hysteresis in the
 * 0 .. -cl->{c,}buffer range. It is meant to limit the number of
 * mode transitions per time unit. The speed gain is about 1/6.
 */
static inline enum htb_cmode
htb_class_mode(struct htb_class *cl, long *diff)
{
	long toks;

	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
		*diff = -toks;
		return HTB_CANT_SEND;
	}

	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
		return HTB_CAN_SEND;

	*diff = -toks;
	return HTB_MAY_BORROW;
}
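
/* Numeric example (illustrative, hysteresis off so both water marks are
   0): suppose that after adding *diff the class would have
   ctokens = 20000 and tokens = -5000.  The ceil test passes
   (20000 >= 0) but the rate test fails (-5000 < 0), so the mode is
   HTB_MAY_BORROW and *diff is set to 5000 - the token deficit that must
   be replenished before the next mode change. */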

/**
 * htb_change_class_mode - changes class's mode
 *
 * This should be the only way to change a class's mode under normal
 * circumstances. The routine will update the feed list linkage, change
 * the mode and add the class to the wait event queue if appropriate.
 * The new mode should be different from the old one and cl->pq_key has
 * to be valid if changing to a mode other than HTB_CAN_SEND (see
 * htb_add_to_wait_tree).
 */
static void
htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
{
	enum htb_cmode new_mode = htb_class_mode(cl, diff);

	if (new_mode == cl->cmode)
		return;

	if (cl->prio_activity) {	/* not necessary: speed optimization */
		if (cl->cmode != HTB_CANT_SEND)
			htb_deactivate_prios(q, cl);
		cl->cmode = new_mode;
		if (new_mode != HTB_CANT_SEND)
			htb_activate_prios(q, cl);
	} else
		cl->cmode = new_mode;
}

/**
 * htb_activate - inserts leaf cl into appropriate active feeds
 *
 * Routine learns the (new) priority of the leaf and activates the feed
 * chain for that prio. It can be safely called on an already active
 * leaf. It also adds the leaf into the droplist.
 */
static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
{
	BUG_TRAP(!cl->level && cl->un.leaf.q && cl->un.leaf.q->q.qlen);

	if (!cl->prio_activity) {
		cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio);
		htb_activate_prios(q, cl);
		list_add_tail(&cl->un.leaf.drop_list,
			      q->drops + cl->un.leaf.aprio);
	}
}

/**
 * htb_deactivate - remove leaf cl from active feeds
 *
 * Make sure that the leaf is active. In other words, it can't be called
 * with a non-active leaf. It also removes the class from the drop list.
 */
static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
{
	BUG_TRAP(cl->prio_activity);

	htb_deactivate_prios(q, cl);
	cl->prio_activity = 0;
	list_del_init(&cl->un.leaf.drop_list);
}

static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	int ret;
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl = htb_classify(skb, sch, &ret);

	if (cl == HTB_DIRECT) {
		/* enqueue to helper queue */
		if (q->direct_queue.qlen < q->direct_qlen) {
			__skb_queue_tail(&q->direct_queue, skb);
			q->direct_pkts++;
		} else {
			kfree_skb(skb);
			sch->qstats.drops++;
			return NET_XMIT_DROP;
		}
#ifdef CONFIG_NET_CLS_ACT
	} else if (!cl) {
		if (ret == NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
#endif
	} else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) !=
		   NET_XMIT_SUCCESS) {
		sch->qstats.drops++;
		cl->qstats.drops++;
		return NET_XMIT_DROP;
	} else {
		cl->bstats.packets++;
		cl->bstats.bytes += skb->len;
		htb_activate(q, cl);
	}

	sch->q.qlen++;
	sch->bstats.packets++;
	sch->bstats.bytes += skb->len;
	return NET_XMIT_SUCCESS;
}

/* TODO: requeuing a packet charges it to policers again !! */
static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct htb_sched *q = qdisc_priv(sch);
	int ret = NET_XMIT_SUCCESS;
	struct htb_class *cl = htb_classify(skb, sch, &ret);
	struct sk_buff *tskb;

	if (cl == HTB_DIRECT || !cl) {
		/* enqueue to helper queue */
		if (q->direct_queue.qlen < q->direct_qlen && cl) {
			__skb_queue_head(&q->direct_queue, skb);
		} else {
			__skb_queue_head(&q->direct_queue, skb);
			tskb = __skb_dequeue_tail(&q->direct_queue);
			kfree_skb(tskb);
			sch->qstats.drops++;
			return NET_XMIT_CN;
		}
	} else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) !=
		   NET_XMIT_SUCCESS) {
		sch->qstats.drops++;
		cl->qstats.drops++;
		return NET_XMIT_DROP;
	} else
		htb_activate(q, cl);

	sch->q.qlen++;
	sch->qstats.requeues++;
	return NET_XMIT_SUCCESS;
}

#ifdef HTB_RATECM
#define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0
static void htb_rate_timer(unsigned long arg)
{
	struct Qdisc *sch = (struct Qdisc *)arg;
	struct htb_sched *q = qdisc_priv(sch);
	struct hlist_node *p;
	struct htb_class *cl;

	/* lock queue so that we can muck with it */
	spin_lock_bh(&sch->dev->queue_lock);

	q->rttim.expires = jiffies + HZ;
	add_timer(&q->rttim);

	/* scan and recompute one bucket at a time */
	if (++q->recmp_bucket >= HTB_HSIZE)
		q->recmp_bucket = 0;

	hlist_for_each_entry(cl, p, q->hash + q->recmp_bucket, hlist) {
		RT_GEN(cl->sum_bytes, cl->rate_bytes);
		RT_GEN(cl->sum_packets, cl->rate_packets);
	}
	spin_unlock_bh(&sch->dev->queue_lock);
}
#endif

/**
 * htb_charge_class - charges amount "bytes" to leaf and ancestors
 *
 * Routine assumes that a packet "bytes" long was dequeued from leaf cl
 * borrowing from "level". It accounts bytes to the ceil leaky bucket for
 * the leaf and all ancestors and to the rate bucket for ancestors at
 * levels "level" and higher. It also handles the possible change of mode
 * resulting from the update. Note that the mode can also increase here
 * (MAY_BORROW to CAN_SEND) because we can use a more precise clock than
 * the event queue here. In such a case we remove the class from the
 * event queue first.
 */
static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
			     int level, int bytes)
{
	long toks, diff;
	enum htb_cmode old_mode;

#define HTB_ACCNT(T,B,R) toks = diff + cl->T; \
	if (toks > cl->B) toks = cl->B; \
	toks -= L2T(cl, cl->R, bytes); \
	if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \
	cl->T = toks

	while (cl) {
		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
		if (cl->level >= level) {
			if (cl->level == level)
				cl->xstats.lends++;
			HTB_ACCNT(tokens, buffer, rate);
		} else {
			cl->xstats.borrows++;
			cl->tokens += diff;	/* we moved t_c; update tokens */
		}
		HTB_ACCNT(ctokens, cbuffer, ceil);
		cl->t_c = q->now;

		old_mode = cl->cmode;
		diff = 0;
		htb_change_class_mode(q, cl, &diff);
		if (old_mode != cl->cmode) {
			if (old_mode != HTB_CAN_SEND)
				htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
			if (cl->cmode != HTB_CAN_SEND)
				htb_add_to_wait_tree(q, cl, diff);
		}
#ifdef HTB_RATECM
		/* update rate counters */
		cl->sum_bytes += bytes;
		cl->sum_packets++;
#endif

		/* update byte stats except for leaves which are already updated */
		if (cl->level) {
			cl->bstats.bytes += bytes;
			cl->bstats.packets++;
		}
		cl = cl->parent;
	}
}
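
/* One HTB_ACCNT step with example numbers (illustrative, not from the
   source): suppose diff = 12000 ticks elapsed since t_c, cl->tokens =
   -2000, cl->buffer = 8000 and L2T() = 3000 for this packet.  Then
   toks = 12000 + (-2000) = 10000, clamped to the bucket depth 8000,
   minus the packet's 3000-tick transmission time, leaving cl->tokens =
   5000.  The lower clamp at -cl->mbuffer bounds how much debt a class
   can accumulate. */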

/**
 * htb_do_events - make mode changes to classes at the level
 *
 * Scans the event queue for pending events and applies them. Returns the
 * time of the next pending event (0 if there is no event in the pq).
 * Note: only events with cl->pq_key <= q->now are applied.
 */
static psched_time_t htb_do_events(struct htb_sched *q, int level)
{
	int i;

	for (i = 0; i < 500; i++) {
		struct htb_class *cl;
		long diff;
		struct rb_node *p = rb_first(&q->wait_pq[level]);

		if (!p)
			return 0;

		cl = rb_entry(p, struct htb_class, pq_node);
		if (cl->pq_key > q->now)
			return cl->pq_key;

		htb_safe_rb_erase(p, q->wait_pq + level);
		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
		htb_change_class_mode(q, cl, &diff);
		if (cl->cmode != HTB_CAN_SEND)
			htb_add_to_wait_tree(q, cl, diff);
	}
	if (net_ratelimit())
		printk(KERN_WARNING "htb: too many events !\n");
	return q->now + PSCHED_TICKS_PER_SEC / 10;
}

/* Returns class->node+prio from the id-tree where the class's id is >= id.
   NULL if no such one exists. */
static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
					      u32 id)
{
	struct rb_node *r = NULL;
	while (n) {
		struct htb_class *cl =
		    rb_entry(n, struct htb_class, node[prio]);
		if (id == cl->classid)
			return n;

		if (id > cl->classid) {
			n = n->rb_right;
		} else {
			r = n;
			n = n->rb_left;
		}
	}
	return r;
}

/**
 * htb_lookup_leaf - returns next leaf class in DRR order
 *
 * Find the leaf that the current feed pointer points to.
 */
static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
					 struct rb_node **pptr, u32 *pid)
{
	int i;
	struct {
		struct rb_node *root;
		struct rb_node **pptr;
		u32 *pid;
	} stk[TC_HTB_MAXDEPTH], *sp = stk;

	BUG_TRAP(tree->rb_node);
	sp->root = tree->rb_node;
	sp->pptr = pptr;
	sp->pid = pid;

	for (i = 0; i < 65535; i++) {
		if (!*sp->pptr && *sp->pid) {
			/* ptr was invalidated but the id is valid - try to
			   recover the original or next ptr */
			*sp->pptr =
			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
		}
		*sp->pid = 0;	/* ptr is valid now, so remove this hint as
				   it can become out of date quickly */
		if (!*sp->pptr) {	/* we are at the right end; rewind & go up */
			*sp->pptr = sp->root;
			while ((*sp->pptr)->rb_left)
				*sp->pptr = (*sp->pptr)->rb_left;
			if (sp > stk) {
				sp--;
				BUG_TRAP(*sp->pptr);
				if (!*sp->pptr)
					return NULL;
				htb_next_rb_node(sp->pptr);
			}
		} else {
			struct htb_class *cl;
			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
			if (!cl->level)
				return cl;
			(++sp)->root = cl->un.inner.feed[prio].rb_node;
			sp->pptr = cl->un.inner.ptr + prio;
			sp->pid = cl->un.inner.last_ptr_id + prio;
		}
	}
	BUG_TRAP(0);
	return NULL;
}

/* dequeues a packet at given priority and level; call only if
   you are sure that there is an active class at prio/level */
static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
					int level)
{
	struct sk_buff *skb = NULL;
	struct htb_class *cl, *start;
	/* look up the initial class in the row */
	start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
				     q->ptr[level] + prio,
				     q->last_ptr_id[level] + prio);

	do {
next:
		BUG_TRAP(cl);
		if (!cl)
			return NULL;

		/* class can be empty - it is unlikely but can be true if the
		   leaf qdisc drops packets in its enqueue routine or if
		   someone used the graft operation on the leaf since the
		   last dequeue; simply deactivate and skip such a class */
		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
			struct htb_class *next;
			htb_deactivate(q, cl);

			/* row/level might become empty */
			if ((q->row_mask[level] & (1 << prio)) == 0)
				return NULL;

			next = htb_lookup_leaf(q->row[level] + prio,
					       prio, q->ptr[level] + prio,
					       q->last_ptr_id[level] + prio);

			if (cl == start)	/* fix start if we just deleted it */
				start = next;
			cl = next;
			goto next;
		}

		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
		if (likely(skb != NULL))
			break;
		if (!cl->warned) {
			printk(KERN_WARNING
			       "htb: class %X isn't work conserving ?!\n",
			       cl->classid);
			cl->warned = 1;
		}
		q->nwc_hit++;
		htb_next_rb_node((level ? cl->parent->un.inner.ptr :
				  q->ptr[0]) + prio);
		cl = htb_lookup_leaf(q->row[level] + prio, prio,
				     q->ptr[level] + prio,
				     q->last_ptr_id[level] + prio);

	} while (cl != start);

	if (likely(skb != NULL)) {
		if ((cl->un.leaf.deficit[level] -= skb->len) < 0) {
			cl->un.leaf.deficit[level] += cl->un.leaf.quantum;
			htb_next_rb_node((level ? cl->parent->un.inner.ptr :
					  q->ptr[0]) + prio);
		}
		/* this used to be done after charge_class, but this ordering
		   gives us slightly better performance */
		if (!cl->un.leaf.q->q.qlen)
			htb_deactivate(q, cl);
		htb_charge_class(q, cl, level, skb->len);
	}
	return skb;
}
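
/* DRR bookkeeping example for the deficit update above (illustrative
   numbers): a leaf with quantum 1500 and deficit[level] = 200 that
   dequeues a 1000-byte skb goes to deficit -800; the quantum is added
   back (deficit becomes 700) and the round-robin pointer advances to the
   next leaf.  Had the skb been 100 bytes, the deficit would stay at 100
   and the same leaf would be served again on the next dequeue. */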

static struct sk_buff *htb_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb = NULL;
	struct htb_sched *q = qdisc_priv(sch);
	int level;
	psched_time_t next_event;

	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
	skb = __skb_dequeue(&q->direct_queue);
	if (skb != NULL) {
		sch->flags &= ~TCQ_F_THROTTLED;
		sch->q.qlen--;
		return skb;
	}

	if (!sch->q.qlen)
		goto fin;
	q->now = psched_get_time();

	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
	q->nwc_hit = 0;
	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
		/* common case optimization - skip event handler quickly */
		int m;
		psched_time_t event;

		if (q->now >= q->near_ev_cache[level]) {
			event = htb_do_events(q, level);
			if (!event)
				event = q->now + PSCHED_TICKS_PER_SEC;
			q->near_ev_cache[level] = event;
		} else
			event = q->near_ev_cache[level];

		if (event && next_event > event)
			next_event = event;

		m = ~q->row_mask[level];
		while (m != (int)(-1)) {
			int prio = ffz(m);
			m |= 1 << prio;
			skb = htb_dequeue_tree(q, prio, level);
			if (likely(skb != NULL)) {
				sch->q.qlen--;
				sch->flags &= ~TCQ_F_THROTTLED;
				goto fin;
			}
		}
	}
	sch->qstats.overlimits++;
	qdisc_watchdog_schedule(&q->watchdog, next_event);
fin:
	return skb;
}

/* try to drop from each class (by prio) until one succeeds */
static unsigned int htb_drop(struct Qdisc *sch)
{
	struct htb_sched *q = qdisc_priv(sch);
	int prio;

	for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
		struct list_head *p;
		list_for_each(p, q->drops + prio) {
			struct htb_class *cl = list_entry(p, struct htb_class,
							  un.leaf.drop_list);
			unsigned int len;
			if (cl->un.leaf.q->ops->drop &&
			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
				sch->q.qlen--;
				if (!cl->un.leaf.q->q.qlen)
					htb_deactivate(q, cl);
				return len;
			}
		}
	}
	return 0;
}

/* reset all classes */
/* always called under BH & queue lock */
static void htb_reset(struct Qdisc *sch)
{
	struct htb_sched *q = qdisc_priv(sch);
	int i;

	for (i = 0; i < HTB_HSIZE; i++) {
		struct hlist_node *p;
		struct htb_class *cl;

		hlist_for_each_entry(cl, p, q->hash + i, hlist) {
			if (cl->level)
				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
			else {
				if (cl->un.leaf.q)
					qdisc_reset(cl->un.leaf.q);
				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
			}
			cl->prio_activity = 0;
			cl->cmode = HTB_CAN_SEND;
		}
	}
	qdisc_watchdog_cancel(&q->watchdog);
	__skb_queue_purge(&q->direct_queue);
	sch->q.qlen = 0;
	memset(q->row, 0, sizeof(q->row));
	memset(q->row_mask, 0, sizeof(q->row_mask));
	memset(q->wait_pq, 0, sizeof(q->wait_pq));
	memset(q->ptr, 0, sizeof(q->ptr));
	for (i = 0; i < TC_HTB_NUMPRIO; i++)
		INIT_LIST_HEAD(q->drops + i);
}

static int htb_init(struct Qdisc *sch, struct rtattr *opt)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct rtattr *tb[TCA_HTB_INIT];
	struct tc_htb_glob *gopt;
	int i;

	if (!opt || rtattr_parse_nested(tb, TCA_HTB_INIT, opt) ||
	    tb[TCA_HTB_INIT - 1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_HTB_INIT - 1]) < sizeof(*gopt)) {
		printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
		return -EINVAL;
	}
	gopt = RTA_DATA(tb[TCA_HTB_INIT - 1]);
	if (gopt->version != HTB_VER >> 16) {
		printk(KERN_ERR
		       "HTB: need tc/htb version %d (minor is %d), you have %d\n",
		       HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
		return -EINVAL;
	}

	INIT_LIST_HEAD(&q->root);
	for (i = 0; i < HTB_HSIZE; i++)
		INIT_HLIST_HEAD(q->hash + i);
	for (i = 0; i < TC_HTB_NUMPRIO; i++)
		INIT_LIST_HEAD(q->drops + i);

	qdisc_watchdog_init(&q->watchdog, sch);
	skb_queue_head_init(&q->direct_queue);

	q->direct_qlen = sch->dev->tx_queue_len;
	if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
		q->direct_qlen = 2;

#ifdef HTB_RATECM
	init_timer(&q->rttim);
	q->rttim.function = htb_rate_timer;
	q->rttim.data = (unsigned long)sch;
	q->rttim.expires = jiffies + HZ;
	add_timer(&q->rttim);
#endif
	if ((q->rate2quantum = gopt->rate2quantum) < 1)
		q->rate2quantum = 1;
	q->defcls = gopt->defcls;

	return 0;
}
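
/* The global options parsed above are supplied by tc, e.g. (illustrative
   command, not from the source):
	tc qdisc add dev eth0 root handle 1: htb default 20 r2q 10
   which sets gopt->defcls to the (hex) minor 0x20 and gopt->rate2quantum
   to 10; the direct queue length then defaults to the device's
   tx_queue_len. */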

static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct htb_sched *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct rtattr *rta;
	struct tc_htb_glob gopt;

	spin_lock_bh(&sch->dev->queue_lock);
	gopt.direct_pkts = q->direct_pkts;
	gopt.version = HTB_VER;
	gopt.rate2quantum = q->rate2quantum;
	gopt.defcls = q->defcls;
	gopt.debug = 0;
	rta = (struct rtattr *)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
	RTA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
	rta->rta_len = skb_tail_pointer(skb) - b;
	spin_unlock_bh(&sch->dev->queue_lock);
	return skb->len;

rtattr_failure:
	spin_unlock_bh(&sch->dev->queue_lock);
	nlmsg_trim(skb, skb_tail_pointer(skb));
	return -1;
}

static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	struct htb_class *cl = (struct htb_class *)arg;
	unsigned char *b = skb_tail_pointer(skb);
	struct rtattr *rta;
	struct tc_htb_opt opt;

	spin_lock_bh(&sch->dev->queue_lock);
	tcm->tcm_parent = cl->parent ? cl->parent->classid : TC_H_ROOT;
	tcm->tcm_handle = cl->classid;
	if (!cl->level && cl->un.leaf.q)
		tcm->tcm_info = cl->un.leaf.q->handle;

	rta = (struct rtattr *)b;
	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);

	memset(&opt, 0, sizeof(opt));

	opt.rate = cl->rate->rate;
	opt.buffer = cl->buffer;
	opt.ceil = cl->ceil->rate;
	opt.cbuffer = cl->cbuffer;
	opt.quantum = cl->un.leaf.quantum;
	opt.prio = cl->un.leaf.prio;
	opt.level = cl->level;
	RTA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
	rta->rta_len = skb_tail_pointer(skb) - b;
	spin_unlock_bh(&sch->dev->queue_lock);
	return skb->len;

rtattr_failure:
	spin_unlock_bh(&sch->dev->queue_lock);
	nlmsg_trim(skb, b);
	return -1;
}

static int
htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
{
	struct htb_class *cl = (struct htb_class *)arg;

#ifdef HTB_RATECM
	cl->rate_est.bps = cl->rate_bytes / (HTB_EWMAC * HTB_HSIZE);
	cl->rate_est.pps = cl->rate_packets / (HTB_EWMAC * HTB_HSIZE);
#endif

	if (!cl->level && cl->un.leaf.q)
		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
	cl->xstats.tokens = cl->tokens;
	cl->xstats.ctokens = cl->ctokens;

	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
		return -1;

	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
}

static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
	struct htb_class *cl = (struct htb_class *)arg;

	if (cl && !cl->level) {
		if (new == NULL &&
		    (new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
					     cl->classid))
		    == NULL)
			return -ENOBUFS;
		sch_tree_lock(sch);
		if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) {
			qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
			qdisc_reset(*old);
		}
		sch_tree_unlock(sch);
		return 0;
	}
	return -ENOENT;
}

static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct htb_class *cl = (struct htb_class *)arg;
	return (cl && !cl->level) ? cl->un.leaf.q : NULL;
}

static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
	struct htb_class *cl = (struct htb_class *)arg;

	if (cl->un.leaf.q->q.qlen == 0)
		htb_deactivate(qdisc_priv(sch), cl);
}

static unsigned long htb_get(struct Qdisc *sch, u32 classid)
{
	struct htb_class *cl = htb_find(classid, sch);
	if (cl)
		cl->refcnt++;
	return (unsigned long)cl;
}

static inline int htb_parent_last_child(struct htb_class *cl)
{
	if (!cl->parent)
		/* the root class */
		return 0;

	if (!(cl->parent->children.next == &cl->sibling &&
		cl->parent->children.prev == &cl->sibling))
		/* not the last child */
		return 0;

	return 1;
}

static void htb_parent_to_leaf(struct htb_class *cl, struct Qdisc *new_q)
{
	struct htb_class *parent = cl->parent;

	BUG_TRAP(!cl->level && cl->un.leaf.q && !cl->prio_activity);

	parent->level = 0;
	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
	parent->un.leaf.quantum = parent->quantum;
	parent->un.leaf.prio = parent->prio;
	parent->tokens = parent->buffer;
	parent->ctokens = parent->cbuffer;
	parent->t_c = psched_get_time();
	parent->cmode = HTB_CAN_SEND;
}

static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
{
	struct htb_sched *q = qdisc_priv(sch);

	if (!cl->level) {
		BUG_TRAP(cl->un.leaf.q);
		qdisc_destroy(cl->un.leaf.q);
	}
	qdisc_put_rtab(cl->rate);
	qdisc_put_rtab(cl->ceil);

	tcf_destroy_chain(cl->filter_list);

	while (!list_empty(&cl->children))
		htb_destroy_class(sch, list_entry(cl->children.next,
						  struct htb_class, sibling));

	/* note: this delete may happen twice (see htb_delete) */
	hlist_del_init(&cl->hlist);
	list_del(&cl->sibling);

	if (cl->prio_activity)
		htb_deactivate(q, cl);

	if (cl->cmode != HTB_CAN_SEND)
		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);

	kfree(cl);
}

/* always called under BH & queue lock */
static void htb_destroy(struct Qdisc *sch)
{
	struct htb_sched *q = qdisc_priv(sch);

	qdisc_watchdog_cancel(&q->watchdog);
#ifdef HTB_RATECM
	del_timer_sync(&q->rttim);
#endif
	/* This line used to be after the htb_destroy_class call below,
	   and surprisingly it worked in 2.4. But it must precede it
	   because a filter needs its target class alive to be able to
	   call unbind_filter on it (without an oops). */
	tcf_destroy_chain(q->filter_list);

	while (!list_empty(&q->root))
		htb_destroy_class(sch, list_entry(q->root.next,
						  struct htb_class, sibling));

	__skb_queue_purge(&q->direct_queue);
}

static int htb_delete(struct Qdisc *sch, unsigned long arg)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl = (struct htb_class *)arg;
	unsigned int qlen;
	struct Qdisc *new_q = NULL;
	int last_child = 0;

	// TODO: why don't we allow deleting a subtree? references? does the
	// tc subsystem guarantee us that in htb_destroy it holds no class
	// refs, so that we can remove children safely there?
	if (!list_empty(&cl->children) || cl->filter_cnt)
		return -EBUSY;

	if (!cl->level && htb_parent_last_child(cl)) {
		new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops,
						cl->parent->classid);
		last_child = 1;
	}

	sch_tree_lock(sch);

	if (!cl->level) {
		qlen = cl->un.leaf.q->q.qlen;
		qdisc_reset(cl->un.leaf.q);
		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
	}

	/* delete from hash and active; remainder in destroy_class */
	hlist_del_init(&cl->hlist);

	if (cl->prio_activity)
		htb_deactivate(q, cl);

	if (last_child)
		htb_parent_to_leaf(cl, new_q);

	if (--cl->refcnt == 0)
		htb_destroy_class(sch, cl);

	sch_tree_unlock(sch);
	return 0;
}

static void htb_put(struct Qdisc *sch, unsigned long arg)
{
	struct htb_class *cl = (struct htb_class *)arg;

	if (--cl->refcnt == 0)
		htb_destroy_class(sch, cl);
}

static int htb_change_class(struct Qdisc *sch, u32 classid,
			    u32 parentid, struct rtattr **tca,
			    unsigned long *arg)
{
	int err = -EINVAL;
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl = (struct htb_class *)*arg, *parent;
	struct rtattr *opt = tca[TCA_OPTIONS - 1];
	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
	struct rtattr *tb[TCA_HTB_RTAB];
	struct tc_htb_opt *hopt;

	/* extract all subattrs from opt attr */
	if (!opt || rtattr_parse_nested(tb, TCA_HTB_RTAB, opt) ||
	    tb[TCA_HTB_PARMS - 1] == NULL ||
	    RTA_PAYLOAD(tb[TCA_HTB_PARMS - 1]) < sizeof(*hopt))
		goto failure;

	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);

	hopt = RTA_DATA(tb[TCA_HTB_PARMS - 1]);

	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB - 1]);
	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB - 1]);
	if (!rtab || !ctab)
		goto failure;

	if (!cl) {		/* new class */
		struct Qdisc *new_q;
		int prio;

		/* check for valid classid */
		if (!classid || TC_H_MAJ(classid ^ sch->handle)
		    || htb_find(classid, sch))
			goto failure;

		/* check maximal depth */
		if (parent && parent->parent && parent->parent->level < 2) {
			printk(KERN_ERR "htb: tree is too deep\n");
			goto failure;
		}
		err = -ENOBUFS;
		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
			goto failure;

		cl->refcnt = 1;
		INIT_LIST_HEAD(&cl->sibling);
		INIT_HLIST_NODE(&cl->hlist);
		INIT_LIST_HEAD(&cl->children);
		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
		RB_CLEAR_NODE(&cl->pq_node);

		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
			RB_CLEAR_NODE(&cl->node[prio]);

		/* create the leaf qdisc early because it uses
		   kmalloc(GFP_KERNEL), which can't be used inside
		   sch_tree_lock() -- thanks to Karlis Peisenieks */
		new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops, classid);
		sch_tree_lock(sch);
		if (parent && !parent->level) {
			unsigned int qlen = parent->un.leaf.q->q.qlen;

			/* turn parent into inner node */
			qdisc_reset(parent->un.leaf.q);
			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
			qdisc_destroy(parent->un.leaf.q);
			if (parent->prio_activity)
				htb_deactivate(q, parent);

			/* remove from evt list because of level change */
			if (parent->cmode != HTB_CAN_SEND) {
				htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
				parent->cmode = HTB_CAN_SEND;
			}
			parent->level = (parent->parent ? parent->parent->level
					 : TC_HTB_MAXDEPTH) - 1;
			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
		}
		/* leaf (we) needs elementary qdisc */
		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;

		cl->classid = classid;
		cl->parent = parent;

		/* set class to be in HTB_CAN_SEND state */
		cl->tokens = hopt->buffer;
		cl->ctokens = hopt->cbuffer;
		cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC;	/* 1min */
		cl->t_c = psched_get_time();
		cl->cmode = HTB_CAN_SEND;

		/* attach to the hash list and parent's family */
		hlist_add_head(&cl->hlist, q->hash + htb_hash(classid));
		list_add_tail(&cl->sibling,
			      parent ? &parent->children : &q->root);
	} else
		sch_tree_lock(sch);

	/* there used to be a nasty bug here: we have to check that the node
	   is really a leaf before changing cl->un.leaf ! */
	if (!cl->level) {
		cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum;
		if (!hopt->quantum && cl->un.leaf.quantum < 1000) {
			printk(KERN_WARNING
			       "HTB: quantum of class %X is small. Consider r2q change.\n",
			       cl->classid);
			cl->un.leaf.quantum = 1000;
		}
		if (!hopt->quantum && cl->un.leaf.quantum > 200000) {
			printk(KERN_WARNING
			       "HTB: quantum of class %X is big. Consider r2q change.\n",
			       cl->classid);
			cl->un.leaf.quantum = 200000;
		}
		if (hopt->quantum)
			cl->un.leaf.quantum = hopt->quantum;
		if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO)
			cl->un.leaf.prio = TC_HTB_NUMPRIO - 1;

		/* backup for htb_parent_to_leaf */
		cl->quantum = cl->un.leaf.quantum;
		cl->prio = cl->un.leaf.prio;
	}

	cl->buffer = hopt->buffer;
	cl->cbuffer = hopt->cbuffer;
	if (cl->rate)
		qdisc_put_rtab(cl->rate);
	cl->rate = rtab;
	if (cl->ceil)
		qdisc_put_rtab(cl->ceil);
	cl->ceil = ctab;
	sch_tree_unlock(sch);

	*arg = (unsigned long)cl;
	return 0;

failure:
	if (rtab)
		qdisc_put_rtab(rtab);
	if (ctab)
		qdisc_put_rtab(ctab);
	return err;
}
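
/* Quantum sanity check with example numbers (illustrative):
   rtab->rate.rate is in bytes per second, so with r2q = 10 a 100kbit
   class (12500 B/s) gets quantum 1250, inside the accepted
   [1000, 200000] window.  Rates below roughly 80kbit compute a quantum
   under 1000 and trigger the "quantum ... is small" warning unless an
   explicit quantum is configured. */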

static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl = (struct htb_class *)arg;
	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;

	return fl;
}

static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
				     u32 classid)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl = htb_find(classid, sch);

	/*if (cl && !cl->level) return 0;
	   The line above used to be there to prevent attaching filters to
	   leaves. But at least the tc_index filter uses this just to get
	   the class for other reasons, so we have to allow it.
	   ----
	   19.6.2002 As Werner explained, it is ok - bind filter is just
	   another way to "lock" the class - unlike "get" this lock can
	   be broken by the class during destroy, IIUC.
	 */
	if (cl)
		cl->filter_cnt++;
	else
		q->filter_cnt++;
	return (unsigned long)cl;
}

static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
	struct htb_sched *q = qdisc_priv(sch);
	struct htb_class *cl = (struct htb_class *)arg;

	if (cl)
		cl->filter_cnt--;
	else
		q->filter_cnt--;
}

static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct htb_sched *q = qdisc_priv(sch);
	int i;

	if (arg->stop)
		return;

	for (i = 0; i < HTB_HSIZE; i++) {
		struct hlist_node *p;
		struct htb_class *cl;

		hlist_for_each_entry(cl, p, q->hash + i, hlist) {
			if (arg->count < arg->skip) {
				arg->count++;
				continue;
			}
			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
				arg->stop = 1;
				return;
			}
			arg->count++;
		}
	}
}

static struct Qdisc_class_ops htb_class_ops = {
	.graft		=	htb_graft,
	.leaf		=	htb_leaf,
	.qlen_notify	=	htb_qlen_notify,
	.get		=	htb_get,
	.put		=	htb_put,
	.change		=	htb_change_class,
	.delete		=	htb_delete,
	.walk		=	htb_walk,
	.tcf_chain	=	htb_find_tcf,
	.bind_tcf	=	htb_bind_filter,
	.unbind_tcf	=	htb_unbind_filter,
	.dump		=	htb_dump_class,
	.dump_stats	=	htb_dump_class_stats,
};

static struct Qdisc_ops htb_qdisc_ops = {
	.next		=	NULL,
	.cl_ops		=	&htb_class_ops,
	.id		=	"htb",
	.priv_size	=	sizeof(struct htb_sched),
	.enqueue	=	htb_enqueue,
	.dequeue	=	htb_dequeue,
	.requeue	=	htb_requeue,
	.drop		=	htb_drop,
	.init		=	htb_init,
	.reset		=	htb_reset,
	.destroy	=	htb_destroy,
	.change		=	NULL /* htb_change */,
	.dump		=	htb_dump,
	.owner		=	THIS_MODULE,
};

static int __init htb_module_init(void)
{
	return register_qdisc(&htb_qdisc_ops);
}

static void __exit htb_module_exit(void)
{
	unregister_qdisc(&htb_qdisc_ops);
}

module_init(htb_module_init)
module_exit(htb_module_exit)
MODULE_LICENSE("GPL");