1/*
2 * net/sched/sch_htb.c	Hierarchical token bucket, feed tree version
3 *
4 *		This program is free software; you can redistribute it and/or
5 *		modify it under the terms of the GNU General Public License
6 *		as published by the Free Software Foundation; either version
7 *		2 of the License, or (at your option) any later version.
8 *
9 * Authors:	Martin Devera, <devik@cdi.cz>
10 *
11 * Credits (in time order) for older HTB versions:
12 *              Stef Coene <stef.coene@docum.org>
13 *			HTB support at LARTC mailing list
14 *		Ondrej Kraus, <krauso@barr.cz>
15 *			found missing INIT_QDISC(htb)
16 *		Vladimir Smelhaus, Aamer Akhter, Bert Hubert
17 *			helped a lot to locate nasty class stall bug
18 *		Andi Kleen, Jamal Hadi, Bert Hubert
19 *			code review and helpful comments on shaping
20 *		Tomasz Wrona, <tw@eter.tym.pl>
21 *			created test case so that I was able to fix nasty bug
22 *		Wilfried Weissmann
23 *			spotted bug in dequeue code and helped with fix
24 *		Jiri Fojtasek
25 *			fixed requeue routine
26 *		and many others. thanks.
27 */
28#include <linux/module.h>
29#include <linux/moduleparam.h>
30#include <linux/types.h>
31#include <linux/kernel.h>
32#include <linux/string.h>
33#include <linux/errno.h>
34#include <linux/skbuff.h>
35#include <linux/list.h>
36#include <linux/compiler.h>
37#include <linux/rbtree.h>
38#include <linux/workqueue.h>
39#include <linux/slab.h>
40#include <net/netlink.h>
41#include <net/pkt_sched.h>
42
43/* HTB algorithm.
44    Author: devik@cdi.cz
45    ========================================================================
46    HTB is like TBF with multiple classes. It is also similar to CBQ because
47    it allows to assign priority to each class in hierarchy.
48    In fact it is another implementation of Floyd's formal sharing.
49
50    Levels:
51    Each class is assigned level. Leaf has ALWAYS level 0 and root
52    classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level
53    one less than their parent.
54*/
55
56static int htb_hysteresis __read_mostly = 0; /* whether to use mode hysteresis for speedup */
57#define HTB_VER 0x30011		/* major must be matched with number suplied by TC as version */
58
59#if HTB_VER >> 16 != TC_HTB_PROTOVER
60#error "Mismatched sch_htb.c and pkt_sch.h"
61#endif
62
63/* Module parameter and sysfs export */
64module_param    (htb_hysteresis, int, 0640);
65MODULE_PARM_DESC(htb_hysteresis, "Hysteresis mode, less CPU load, less accurate");
66
67/* used internaly to keep status of single class */
68enum htb_cmode {
69	HTB_CANT_SEND,		/* class can't send and can't borrow */
70	HTB_MAY_BORROW,		/* class can't send but may borrow */
71	HTB_CAN_SEND		/* class can send */
72};
73
74/* interior & leaf nodes; props specific to leaves are marked L: */
75struct htb_class {
76	struct Qdisc_class_common common;
77	/* general class parameters */
78	struct gnet_stats_basic_packed bstats;
79	struct gnet_stats_queue qstats;
80	struct gnet_stats_rate_est rate_est;
81	struct tc_htb_xstats xstats;	/* our special stats */
82	int refcnt;		/* usage count of this class */
83
84	/* topology */
85	int level;		/* our level (see above) */
86	unsigned int children;
87	struct htb_class *parent;	/* parent class */
88
89	int prio;		/* these two are used only by leaves... */
90	int quantum;		/* but stored for parent-to-leaf return */
91
92	union {
93		struct htb_class_leaf {
94			struct Qdisc *q;
95			int deficit[TC_HTB_MAXDEPTH];
96			struct list_head drop_list;
97		} leaf;
98		struct htb_class_inner {
99			struct rb_root feed[TC_HTB_NUMPRIO];	/* feed trees */
100			struct rb_node *ptr[TC_HTB_NUMPRIO];	/* current class ptr */
101			/* When class changes from state 1->2 and disconnects from
102			   parent's feed then we lost ptr value and start from the
103			   first child again. Here we store classid of the
104			   last valid ptr (used when ptr is NULL). */
105			u32 last_ptr_id[TC_HTB_NUMPRIO];
106		} inner;
107	} un;
108	struct rb_node node[TC_HTB_NUMPRIO];	/* node for self or feed tree */
109	struct rb_node pq_node;	/* node for event queue */
110	psched_time_t pq_key;
111
112	int prio_activity;	/* for which prios are we active */
113	enum htb_cmode cmode;	/* current mode of the class */
114
115	/* class attached filters */
116	struct tcf_proto *filter_list;
117	int filter_cnt;
118
119	/* token bucket parameters */
120	struct qdisc_rate_table *rate;	/* rate table of the class itself */
121	struct qdisc_rate_table *ceil;	/* ceiling rate (limits borrows too) */
122	long buffer, cbuffer;	/* token bucket depth/rate */
123	psched_tdiff_t mbuffer;	/* max wait time */
124	long tokens, ctokens;	/* current number of tokens */
125	psched_time_t t_c;	/* checkpoint time */
126};
127
128struct htb_sched {
129	struct Qdisc_class_hash clhash;
130	struct list_head drops[TC_HTB_NUMPRIO];/* active leaves (for drops) */
131
132	/* self list - roots of self generating tree */
133	struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
134	int row_mask[TC_HTB_MAXDEPTH];
135	struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
136	u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO];
137
138	/* self wait list - roots of wait PQs per row */
139	struct rb_root wait_pq[TC_HTB_MAXDEPTH];
140
141	/* time of nearest event per level (row) */
142	psched_time_t near_ev_cache[TC_HTB_MAXDEPTH];
143
144	int defcls;		/* class where unclassified flows go to */
145
146	/* filters for qdisc itself */
147	struct tcf_proto *filter_list;
148
149	int rate2quantum;	/* quant = rate / rate2quantum */
150	psched_time_t now;	/* cached dequeue time */
151	struct qdisc_watchdog watchdog;
152
153	/* non shaped skbs; let them go directly thru */
154	struct sk_buff_head direct_queue;
155	int direct_qlen;	/* max qlen of above */
156
157	long direct_pkts;
158
159#define HTB_WARN_TOOMANYEVENTS	0x1
160	unsigned int warned;	/* only one warning */
161	struct work_struct work;
162};
163
164/* find class in global hash table using given handle */
165static inline struct htb_class *htb_find(u32 handle, struct Qdisc *sch)
166{
167	struct htb_sched *q = qdisc_priv(sch);
168	struct Qdisc_class_common *clc;
169
170	clc = qdisc_class_find(&q->clhash, handle);
171	if (clc == NULL)
172		return NULL;
173	return container_of(clc, struct htb_class, common);
174}
175
176/**
177 * htb_classify - classify a packet into class
178 *
179 * It returns NULL if the packet should be dropped or -1 if the packet
180 * should be passed directly thru. In all other cases leaf class is returned.
181 * We allow direct class selection by classid in priority. The we examine
182 * filters in qdisc and in inner nodes (if higher filter points to the inner
183 * node). If we end up with classid MAJOR:0 we enqueue the skb into special
184 * internal fifo (direct). These packets then go directly thru. If we still
185 * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessfull
186 * then finish and return direct queue.
187 */
188#define HTB_DIRECT (struct htb_class*)-1
189
190static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch,
191				      int *qerr)
192{
193	struct htb_sched *q = qdisc_priv(sch);
194	struct htb_class *cl;
195	struct tcf_result res;
196	struct tcf_proto *tcf;
197	int result;
198
199	/* allow to select class by setting skb->priority to valid classid;
200	   note that nfmark can be used too by attaching filter fw with no
201	   rules in it */
202	if (skb->priority == sch->handle)
203		return HTB_DIRECT;	/* X:0 (direct flow) selected */
204	if ((cl = htb_find(skb->priority, sch)) != NULL && cl->level == 0)
205		return cl;
206
207	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
208	tcf = q->filter_list;
209	while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) {
210#ifdef CONFIG_NET_CLS_ACT
211		switch (result) {
212		case TC_ACT_QUEUED:
213		case TC_ACT_STOLEN:
214			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
215		case TC_ACT_SHOT:
216			return NULL;
217		}
218#endif
219		if ((cl = (void *)res.class) == NULL) {
220			if (res.classid == sch->handle)
221				return HTB_DIRECT;	/* X:0 (direct flow) */
222			if ((cl = htb_find(res.classid, sch)) == NULL)
223				break;	/* filter selected invalid classid */
224		}
225		if (!cl->level)
226			return cl;	/* we hit leaf; return it */
227
228		/* we have got inner class; apply inner filter chain */
229		tcf = cl->filter_list;
230	}
231	/* classification failed; try to use default class */
232	cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch);
233	if (!cl || cl->level)
234		return HTB_DIRECT;	/* bad default .. this is safe bet */
235	return cl;
236}
237
238/**
239 * htb_add_to_id_tree - adds class to the round robin list
240 *
241 * Routine adds class to the list (actually tree) sorted by classid.
242 * Make sure that class is not already on such list for given prio.
243 */
244static void htb_add_to_id_tree(struct rb_root *root,
245			       struct htb_class *cl, int prio)
246{
247	struct rb_node **p = &root->rb_node, *parent = NULL;
248
249	while (*p) {
250		struct htb_class *c;
251		parent = *p;
252		c = rb_entry(parent, struct htb_class, node[prio]);
253
254		if (cl->common.classid > c->common.classid)
255			p = &parent->rb_right;
256		else
257			p = &parent->rb_left;
258	}
259	rb_link_node(&cl->node[prio], parent, p);
260	rb_insert_color(&cl->node[prio], root);
261}
262
263/**
264 * htb_add_to_wait_tree - adds class to the event queue with delay
265 *
266 * The class is added to priority event queue to indicate that class will
267 * change its mode in cl->pq_key microseconds. Make sure that class is not
268 * already in the queue.
269 */
270static void htb_add_to_wait_tree(struct htb_sched *q,
271				 struct htb_class *cl, long delay)
272{
273	struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL;
274
275	cl->pq_key = q->now + delay;
276	if (cl->pq_key == q->now)
277		cl->pq_key++;
278
279	/* update the nearest event cache */
280	if (q->near_ev_cache[cl->level] > cl->pq_key)
281		q->near_ev_cache[cl->level] = cl->pq_key;
282
283	while (*p) {
284		struct htb_class *c;
285		parent = *p;
286		c = rb_entry(parent, struct htb_class, pq_node);
287		if (cl->pq_key >= c->pq_key)
288			p = &parent->rb_right;
289		else
290			p = &parent->rb_left;
291	}
292	rb_link_node(&cl->pq_node, parent, p);
293	rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]);
294}
295
296/**
297 * htb_next_rb_node - finds next node in binary tree
298 *
299 * When we are past last key we return NULL.
300 * Average complexity is 2 steps per call.
301 */
302static inline void htb_next_rb_node(struct rb_node **n)
303{
304	*n = rb_next(*n);
305}
306
307/**
308 * htb_add_class_to_row - add class to its row
309 *
310 * The class is added to row at priorities marked in mask.
311 * It does nothing if mask == 0.
312 */
313static inline void htb_add_class_to_row(struct htb_sched *q,
314					struct htb_class *cl, int mask)
315{
316	q->row_mask[cl->level] |= mask;
317	while (mask) {
318		int prio = ffz(~mask);
319		mask &= ~(1 << prio);
320		htb_add_to_id_tree(q->row[cl->level] + prio, cl, prio);
321	}
322}
323
324/* If this triggers, it is a bug in this code, but it need not be fatal */
325static void htb_safe_rb_erase(struct rb_node *rb, struct rb_root *root)
326{
327	if (RB_EMPTY_NODE(rb)) {
328		WARN_ON(1);
329	} else {
330		rb_erase(rb, root);
331		RB_CLEAR_NODE(rb);
332	}
333}
334
335
336/**
337 * htb_remove_class_from_row - removes class from its row
338 *
339 * The class is removed from row at priorities marked in mask.
340 * It does nothing if mask == 0.
341 */
342static inline void htb_remove_class_from_row(struct htb_sched *q,
343						 struct htb_class *cl, int mask)
344{
345	int m = 0;
346
347	while (mask) {
348		int prio = ffz(~mask);
349
350		mask &= ~(1 << prio);
351		if (q->ptr[cl->level][prio] == cl->node + prio)
352			htb_next_rb_node(q->ptr[cl->level] + prio);
353
354		htb_safe_rb_erase(cl->node + prio, q->row[cl->level] + prio);
355		if (!q->row[cl->level][prio].rb_node)
356			m |= 1 << prio;
357	}
358	q->row_mask[cl->level] &= ~m;
359}
360
361/**
362 * htb_activate_prios - creates active classe's feed chain
363 *
364 * The class is connected to ancestors and/or appropriate rows
365 * for priorities it is participating on. cl->cmode must be new
366 * (activated) mode. It does nothing if cl->prio_activity == 0.
367 */
368static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl)
369{
370	struct htb_class *p = cl->parent;
371	long m, mask = cl->prio_activity;
372
373	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
374		m = mask;
375		while (m) {
376			int prio = ffz(~m);
377			m &= ~(1 << prio);
378
379			if (p->un.inner.feed[prio].rb_node)
380				/* parent already has its feed in use so that
381				   reset bit in mask as parent is already ok */
382				mask &= ~(1 << prio);
383
384			htb_add_to_id_tree(p->un.inner.feed + prio, cl, prio);
385		}
386		p->prio_activity |= mask;
387		cl = p;
388		p = cl->parent;
389
390	}
391	if (cl->cmode == HTB_CAN_SEND && mask)
392		htb_add_class_to_row(q, cl, mask);
393}
394
395/**
396 * htb_deactivate_prios - remove class from feed chain
397 *
398 * cl->cmode must represent old mode (before deactivation). It does
399 * nothing if cl->prio_activity == 0. Class is removed from all feed
400 * chains and rows.
401 */
402static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl)
403{
404	struct htb_class *p = cl->parent;
405	long m, mask = cl->prio_activity;
406
407	while (cl->cmode == HTB_MAY_BORROW && p && mask) {
408		m = mask;
409		mask = 0;
410		while (m) {
411			int prio = ffz(~m);
412			m &= ~(1 << prio);
413
414			if (p->un.inner.ptr[prio] == cl->node + prio) {
415				/* we are removing child which is pointed to from
416				   parent feed - forget the pointer but remember
417				   classid */
418				p->un.inner.last_ptr_id[prio] = cl->common.classid;
419				p->un.inner.ptr[prio] = NULL;
420			}
421
422			htb_safe_rb_erase(cl->node + prio, p->un.inner.feed + prio);
423
424			if (!p->un.inner.feed[prio].rb_node)
425				mask |= 1 << prio;
426		}
427
428		p->prio_activity &= ~mask;
429		cl = p;
430		p = cl->parent;
431
432	}
433	if (cl->cmode == HTB_CAN_SEND && mask)
434		htb_remove_class_from_row(q, cl, mask);
435}
436
437static inline long htb_lowater(const struct htb_class *cl)
438{
439	if (htb_hysteresis)
440		return cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : 0;
441	else
442		return 0;
443}
444static inline long htb_hiwater(const struct htb_class *cl)
445{
446	if (htb_hysteresis)
447		return cl->cmode == HTB_CAN_SEND ? -cl->buffer : 0;
448	else
449		return 0;
450}
451
452
453/**
454 * htb_class_mode - computes and returns current class mode
455 *
456 * It computes cl's mode at time cl->t_c+diff and returns it. If mode
457 * is not HTB_CAN_SEND then cl->pq_key is updated to time difference
458 * from now to time when cl will change its state.
459 * Also it is worth to note that class mode doesn't change simply
460 * at cl->{c,}tokens == 0 but there can rather be hysteresis of
461 * 0 .. -cl->{c,}buffer range. It is meant to limit number of
462 * mode transitions per time unit. The speed gain is about 1/6.
463 */
464static inline enum htb_cmode
465htb_class_mode(struct htb_class *cl, long *diff)
466{
467	long toks;
468
469	if ((toks = (cl->ctokens + *diff)) < htb_lowater(cl)) {
470		*diff = -toks;
471		return HTB_CANT_SEND;
472	}
473
474	if ((toks = (cl->tokens + *diff)) >= htb_hiwater(cl))
475		return HTB_CAN_SEND;
476
477	*diff = -toks;
478	return HTB_MAY_BORROW;
479}
480
481/**
482 * htb_change_class_mode - changes classe's mode
483 *
484 * This should be the only way how to change classe's mode under normal
485 * cirsumstances. Routine will update feed lists linkage, change mode
486 * and add class to the wait event queue if appropriate. New mode should
487 * be different from old one and cl->pq_key has to be valid if changing
488 * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree).
489 */
490static void
491htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff)
492{
493	enum htb_cmode new_mode = htb_class_mode(cl, diff);
494
495	if (new_mode == cl->cmode)
496		return;
497
498	if (cl->prio_activity) {	/* not necessary: speed optimization */
499		if (cl->cmode != HTB_CANT_SEND)
500			htb_deactivate_prios(q, cl);
501		cl->cmode = new_mode;
502		if (new_mode != HTB_CANT_SEND)
503			htb_activate_prios(q, cl);
504	} else
505		cl->cmode = new_mode;
506}
507
508/**
509 * htb_activate - inserts leaf cl into appropriate active feeds
510 *
511 * Routine learns (new) priority of leaf and activates feed chain
512 * for the prio. It can be called on already active leaf safely.
513 * It also adds leaf into droplist.
514 */
515static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
516{
517	WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
518
519	if (!cl->prio_activity) {
520		cl->prio_activity = 1 << cl->prio;
521		htb_activate_prios(q, cl);
522		list_add_tail(&cl->un.leaf.drop_list,
523			      q->drops + cl->prio);
524	}
525}
526
527/**
528 * htb_deactivate - remove leaf cl from active feeds
529 *
530 * Make sure that leaf is active. In the other words it can't be called
531 * with non-active leaf. It also removes class from the drop list.
532 */
533static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl)
534{
535	WARN_ON(!cl->prio_activity);
536
537	htb_deactivate_prios(q, cl);
538	cl->prio_activity = 0;
539	list_del_init(&cl->un.leaf.drop_list);
540}
541
542static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
543{
544	int uninitialized_var(ret);
545	struct htb_sched *q = qdisc_priv(sch);
546	struct htb_class *cl = htb_classify(skb, sch, &ret);
547
548	if (cl == HTB_DIRECT) {
549		/* enqueue to helper queue */
550		if (q->direct_queue.qlen < q->direct_qlen) {
551			__skb_queue_tail(&q->direct_queue, skb);
552			q->direct_pkts++;
553		} else {
554			kfree_skb(skb);
555			sch->qstats.drops++;
556			return NET_XMIT_DROP;
557		}
558#ifdef CONFIG_NET_CLS_ACT
559	} else if (!cl) {
560		if (ret & __NET_XMIT_BYPASS)
561			sch->qstats.drops++;
562		kfree_skb(skb);
563		return ret;
564#endif
565	} else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q)) != NET_XMIT_SUCCESS) {
566		if (net_xmit_drop_count(ret)) {
567			sch->qstats.drops++;
568			cl->qstats.drops++;
569		}
570		return ret;
571	} else {
572		cl->bstats.packets +=
573			skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
574		cl->bstats.bytes += qdisc_pkt_len(skb);
575		htb_activate(q, cl);
576	}
577
578	sch->q.qlen++;
579	sch->bstats.packets += skb_is_gso(skb)?skb_shinfo(skb)->gso_segs:1;
580	sch->bstats.bytes += qdisc_pkt_len(skb);
581	return NET_XMIT_SUCCESS;
582}
583
584static inline void htb_accnt_tokens(struct htb_class *cl, int bytes, long diff)
585{
586	long toks = diff + cl->tokens;
587
588	if (toks > cl->buffer)
589		toks = cl->buffer;
590	toks -= (long) qdisc_l2t(cl->rate, bytes);
591	if (toks <= -cl->mbuffer)
592		toks = 1 - cl->mbuffer;
593
594	cl->tokens = toks;
595}
596
597static inline void htb_accnt_ctokens(struct htb_class *cl, int bytes, long diff)
598{
599	long toks = diff + cl->ctokens;
600
601	if (toks > cl->cbuffer)
602		toks = cl->cbuffer;
603	toks -= (long) qdisc_l2t(cl->ceil, bytes);
604	if (toks <= -cl->mbuffer)
605		toks = 1 - cl->mbuffer;
606
607	cl->ctokens = toks;
608}
609
610/**
611 * htb_charge_class - charges amount "bytes" to leaf and ancestors
612 *
613 * Routine assumes that packet "bytes" long was dequeued from leaf cl
614 * borrowing from "level". It accounts bytes to ceil leaky bucket for
615 * leaf and all ancestors and to rate bucket for ancestors at levels
616 * "level" and higher. It also handles possible change of mode resulting
617 * from the update. Note that mode can also increase here (MAY_BORROW to
618 * CAN_SEND) because we can use more precise clock that event queue here.
619 * In such case we remove class from event queue first.
620 */
621static void htb_charge_class(struct htb_sched *q, struct htb_class *cl,
622			     int level, struct sk_buff *skb)
623{
624	int bytes = qdisc_pkt_len(skb);
625	enum htb_cmode old_mode;
626	long diff;
627
628	while (cl) {
629		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
630		if (cl->level >= level) {
631			if (cl->level == level)
632				cl->xstats.lends++;
633			htb_accnt_tokens(cl, bytes, diff);
634		} else {
635			cl->xstats.borrows++;
636			cl->tokens += diff;	/* we moved t_c; update tokens */
637		}
638		htb_accnt_ctokens(cl, bytes, diff);
639		cl->t_c = q->now;
640
641		old_mode = cl->cmode;
642		diff = 0;
643		htb_change_class_mode(q, cl, &diff);
644		if (old_mode != cl->cmode) {
645			if (old_mode != HTB_CAN_SEND)
646				htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
647			if (cl->cmode != HTB_CAN_SEND)
648				htb_add_to_wait_tree(q, cl, diff);
649		}
650
651		/* update byte stats except for leaves which are already updated */
652		if (cl->level) {
653			cl->bstats.bytes += bytes;
654			cl->bstats.packets += skb_is_gso(skb)?
655					skb_shinfo(skb)->gso_segs:1;
656		}
657		cl = cl->parent;
658	}
659}
660
661/**
662 * htb_do_events - make mode changes to classes at the level
663 *
664 * Scans event queue for pending events and applies them. Returns time of
665 * next pending event (0 for no event in pq, q->now for too many events).
666 * Note: Applied are events whose have cl->pq_key <= q->now.
667 */
668static psched_time_t htb_do_events(struct htb_sched *q, int level,
669				   unsigned long start)
670{
671	/* don't run for longer than 2 jiffies; 2 is used instead of
672	   1 to simplify things when jiffy is going to be incremented
673	   too soon */
674	unsigned long stop_at = start + 2;
675	while (time_before(jiffies, stop_at)) {
676		struct htb_class *cl;
677		long diff;
678		struct rb_node *p = rb_first(&q->wait_pq[level]);
679
680		if (!p)
681			return 0;
682
683		cl = rb_entry(p, struct htb_class, pq_node);
684		if (cl->pq_key > q->now)
685			return cl->pq_key;
686
687		htb_safe_rb_erase(p, q->wait_pq + level);
688		diff = psched_tdiff_bounded(q->now, cl->t_c, cl->mbuffer);
689		htb_change_class_mode(q, cl, &diff);
690		if (cl->cmode != HTB_CAN_SEND)
691			htb_add_to_wait_tree(q, cl, diff);
692	}
693
694	/* too much load - let's continue after a break for scheduling */
695	if (!(q->warned & HTB_WARN_TOOMANYEVENTS)) {
696		printk(KERN_WARNING "htb: too many events!\n");
697		q->warned |= HTB_WARN_TOOMANYEVENTS;
698	}
699
700	return q->now;
701}
702
703/* Returns class->node+prio from id-tree where classe's id is >= id. NULL
704   is no such one exists. */
705static struct rb_node *htb_id_find_next_upper(int prio, struct rb_node *n,
706					      u32 id)
707{
708	struct rb_node *r = NULL;
709	while (n) {
710		struct htb_class *cl =
711		    rb_entry(n, struct htb_class, node[prio]);
712
713		if (id > cl->common.classid) {
714			n = n->rb_right;
715		} else if (id < cl->common.classid) {
716			r = n;
717			n = n->rb_left;
718		} else {
719			return n;
720		}
721	}
722	return r;
723}
724
725/**
726 * htb_lookup_leaf - returns next leaf class in DRR order
727 *
728 * Find leaf where current feed pointers points to.
729 */
730static struct htb_class *htb_lookup_leaf(struct rb_root *tree, int prio,
731					 struct rb_node **pptr, u32 * pid)
732{
733	int i;
734	struct {
735		struct rb_node *root;
736		struct rb_node **pptr;
737		u32 *pid;
738	} stk[TC_HTB_MAXDEPTH], *sp = stk;
739
740	BUG_ON(!tree->rb_node);
741	sp->root = tree->rb_node;
742	sp->pptr = pptr;
743	sp->pid = pid;
744
745	for (i = 0; i < 65535; i++) {
746		if (!*sp->pptr && *sp->pid) {
747			/* ptr was invalidated but id is valid - try to recover
748			   the original or next ptr */
749			*sp->pptr =
750			    htb_id_find_next_upper(prio, sp->root, *sp->pid);
751		}
752		*sp->pid = 0;	/* ptr is valid now so that remove this hint as it
753				   can become out of date quickly */
754		if (!*sp->pptr) {	/* we are at right end; rewind & go up */
755			*sp->pptr = sp->root;
756			while ((*sp->pptr)->rb_left)
757				*sp->pptr = (*sp->pptr)->rb_left;
758			if (sp > stk) {
759				sp--;
760				if (!*sp->pptr) {
761					WARN_ON(1);
762					return NULL;
763				}
764				htb_next_rb_node(sp->pptr);
765			}
766		} else {
767			struct htb_class *cl;
768			cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
769			if (!cl->level)
770				return cl;
771			(++sp)->root = cl->un.inner.feed[prio].rb_node;
772			sp->pptr = cl->un.inner.ptr + prio;
773			sp->pid = cl->un.inner.last_ptr_id + prio;
774		}
775	}
776	WARN_ON(1);
777	return NULL;
778}
779
780/* dequeues packet at given priority and level; call only if
781   you are sure that there is active class at prio/level */
782static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, int prio,
783					int level)
784{
785	struct sk_buff *skb = NULL;
786	struct htb_class *cl, *start;
787	/* look initial class up in the row */
788	start = cl = htb_lookup_leaf(q->row[level] + prio, prio,
789				     q->ptr[level] + prio,
790				     q->last_ptr_id[level] + prio);
791
792	do {
793next:
794		if (unlikely(!cl))
795			return NULL;
796
797		/* class can be empty - it is unlikely but can be true if leaf
798		   qdisc drops packets in enqueue routine or if someone used
799		   graft operation on the leaf since last dequeue;
800		   simply deactivate and skip such class */
801		if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
802			struct htb_class *next;
803			htb_deactivate(q, cl);
804
805			/* row/level might become empty */
806			if ((q->row_mask[level] & (1 << prio)) == 0)
807				return NULL;
808
809			next = htb_lookup_leaf(q->row[level] + prio,
810					       prio, q->ptr[level] + prio,
811					       q->last_ptr_id[level] + prio);
812
813			if (cl == start)	/* fix start if we just deleted it */
814				start = next;
815			cl = next;
816			goto next;
817		}
818
819		skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
820		if (likely(skb != NULL))
821			break;
822
823		qdisc_warn_nonwc("htb", cl->un.leaf.q);
824		htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
825				  ptr[0]) + prio);
826		cl = htb_lookup_leaf(q->row[level] + prio, prio,
827				     q->ptr[level] + prio,
828				     q->last_ptr_id[level] + prio);
829
830	} while (cl != start);
831
832	if (likely(skb != NULL)) {
833		cl->un.leaf.deficit[level] -= qdisc_pkt_len(skb);
834		if (cl->un.leaf.deficit[level] < 0) {
835			cl->un.leaf.deficit[level] += cl->quantum;
836			htb_next_rb_node((level ? cl->parent->un.inner.ptr : q->
837					  ptr[0]) + prio);
838		}
839		/* this used to be after charge_class but this constelation
840		   gives us slightly better performance */
841		if (!cl->un.leaf.q->q.qlen)
842			htb_deactivate(q, cl);
843		htb_charge_class(q, cl, level, skb);
844	}
845	return skb;
846}
847
848static struct sk_buff *htb_dequeue(struct Qdisc *sch)
849{
850	struct sk_buff *skb = NULL;
851	struct htb_sched *q = qdisc_priv(sch);
852	int level;
853	psched_time_t next_event;
854	unsigned long start_at;
855
856	/* try to dequeue direct packets as high prio (!) to minimize cpu work */
857	skb = __skb_dequeue(&q->direct_queue);
858	if (skb != NULL) {
859		sch->flags &= ~TCQ_F_THROTTLED;
860		sch->q.qlen--;
861		return skb;
862	}
863
864	if (!sch->q.qlen)
865		goto fin;
866	q->now = psched_get_time();
867	start_at = jiffies;
868
869	next_event = q->now + 5 * PSCHED_TICKS_PER_SEC;
870
871	for (level = 0; level < TC_HTB_MAXDEPTH; level++) {
872		/* common case optimization - skip event handler quickly */
873		int m;
874		psched_time_t event;
875
876		if (q->now >= q->near_ev_cache[level]) {
877			event = htb_do_events(q, level, start_at);
878			if (!event)
879				event = q->now + PSCHED_TICKS_PER_SEC;
880			q->near_ev_cache[level] = event;
881		} else
882			event = q->near_ev_cache[level];
883
884		if (next_event > event)
885			next_event = event;
886
887		m = ~q->row_mask[level];
888		while (m != (int)(-1)) {
889			int prio = ffz(m);
890			m |= 1 << prio;
891			skb = htb_dequeue_tree(q, prio, level);
892			if (likely(skb != NULL)) {
893				sch->q.qlen--;
894				sch->flags &= ~TCQ_F_THROTTLED;
895				goto fin;
896			}
897		}
898	}
899	sch->qstats.overlimits++;
900	if (likely(next_event > q->now))
901		qdisc_watchdog_schedule(&q->watchdog, next_event);
902	else
903		schedule_work(&q->work);
904fin:
905	return skb;
906}
907
908/* try to drop from each class (by prio) until one succeed */
909static unsigned int htb_drop(struct Qdisc *sch)
910{
911	struct htb_sched *q = qdisc_priv(sch);
912	int prio;
913
914	for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) {
915		struct list_head *p;
916		list_for_each(p, q->drops + prio) {
917			struct htb_class *cl = list_entry(p, struct htb_class,
918							  un.leaf.drop_list);
919			unsigned int len;
920			if (cl->un.leaf.q->ops->drop &&
921			    (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
922				sch->q.qlen--;
923				if (!cl->un.leaf.q->q.qlen)
924					htb_deactivate(q, cl);
925				return len;
926			}
927		}
928	}
929	return 0;
930}
931
932/* reset all classes */
933/* always caled under BH & queue lock */
934static void htb_reset(struct Qdisc *sch)
935{
936	struct htb_sched *q = qdisc_priv(sch);
937	struct htb_class *cl;
938	struct hlist_node *n;
939	unsigned int i;
940
941	for (i = 0; i < q->clhash.hashsize; i++) {
942		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
943			if (cl->level)
944				memset(&cl->un.inner, 0, sizeof(cl->un.inner));
945			else {
946				if (cl->un.leaf.q)
947					qdisc_reset(cl->un.leaf.q);
948				INIT_LIST_HEAD(&cl->un.leaf.drop_list);
949			}
950			cl->prio_activity = 0;
951			cl->cmode = HTB_CAN_SEND;
952
953		}
954	}
955	qdisc_watchdog_cancel(&q->watchdog);
956	__skb_queue_purge(&q->direct_queue);
957	sch->q.qlen = 0;
958	memset(q->row, 0, sizeof(q->row));
959	memset(q->row_mask, 0, sizeof(q->row_mask));
960	memset(q->wait_pq, 0, sizeof(q->wait_pq));
961	memset(q->ptr, 0, sizeof(q->ptr));
962	for (i = 0; i < TC_HTB_NUMPRIO; i++)
963		INIT_LIST_HEAD(q->drops + i);
964}
965
966static const struct nla_policy htb_policy[TCA_HTB_MAX + 1] = {
967	[TCA_HTB_PARMS]	= { .len = sizeof(struct tc_htb_opt) },
968	[TCA_HTB_INIT]	= { .len = sizeof(struct tc_htb_glob) },
969	[TCA_HTB_CTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
970	[TCA_HTB_RTAB]	= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
971};
972
973static void htb_work_func(struct work_struct *work)
974{
975	struct htb_sched *q = container_of(work, struct htb_sched, work);
976	struct Qdisc *sch = q->watchdog.qdisc;
977
978	__netif_schedule(qdisc_root(sch));
979}
980
981static int htb_init(struct Qdisc *sch, struct nlattr *opt)
982{
983	struct htb_sched *q = qdisc_priv(sch);
984	struct nlattr *tb[TCA_HTB_INIT + 1];
985	struct tc_htb_glob *gopt;
986	int err;
987	int i;
988
989	if (!opt)
990		return -EINVAL;
991
992	err = nla_parse_nested(tb, TCA_HTB_INIT, opt, htb_policy);
993	if (err < 0)
994		return err;
995
996	if (tb[TCA_HTB_INIT] == NULL) {
997		printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n");
998		return -EINVAL;
999	}
1000	gopt = nla_data(tb[TCA_HTB_INIT]);
1001	if (gopt->version != HTB_VER >> 16) {
1002		printk(KERN_ERR
1003		       "HTB: need tc/htb version %d (minor is %d), you have %d\n",
1004		       HTB_VER >> 16, HTB_VER & 0xffff, gopt->version);
1005		return -EINVAL;
1006	}
1007
1008	err = qdisc_class_hash_init(&q->clhash);
1009	if (err < 0)
1010		return err;
1011	for (i = 0; i < TC_HTB_NUMPRIO; i++)
1012		INIT_LIST_HEAD(q->drops + i);
1013
1014	qdisc_watchdog_init(&q->watchdog, sch);
1015	INIT_WORK(&q->work, htb_work_func);
1016	skb_queue_head_init(&q->direct_queue);
1017
1018	q->direct_qlen = qdisc_dev(sch)->tx_queue_len;
1019	if (q->direct_qlen < 2)	/* some devices have zero tx_queue_len */
1020		q->direct_qlen = 2;
1021
1022	if ((q->rate2quantum = gopt->rate2quantum) < 1)
1023		q->rate2quantum = 1;
1024	q->defcls = gopt->defcls;
1025
1026	return 0;
1027}
1028
1029static int htb_dump(struct Qdisc *sch, struct sk_buff *skb)
1030{
1031	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1032	struct htb_sched *q = qdisc_priv(sch);
1033	struct nlattr *nest;
1034	struct tc_htb_glob gopt;
1035
1036	spin_lock_bh(root_lock);
1037
1038	gopt.direct_pkts = q->direct_pkts;
1039	gopt.version = HTB_VER;
1040	gopt.rate2quantum = q->rate2quantum;
1041	gopt.defcls = q->defcls;
1042	gopt.debug = 0;
1043
1044	nest = nla_nest_start(skb, TCA_OPTIONS);
1045	if (nest == NULL)
1046		goto nla_put_failure;
1047	NLA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt);
1048	nla_nest_end(skb, nest);
1049
1050	spin_unlock_bh(root_lock);
1051	return skb->len;
1052
1053nla_put_failure:
1054	spin_unlock_bh(root_lock);
1055	nla_nest_cancel(skb, nest);
1056	return -1;
1057}
1058
1059static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
1060			  struct sk_buff *skb, struct tcmsg *tcm)
1061{
1062	struct htb_class *cl = (struct htb_class *)arg;
1063	spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
1064	struct nlattr *nest;
1065	struct tc_htb_opt opt;
1066
1067	spin_lock_bh(root_lock);
1068	tcm->tcm_parent = cl->parent ? cl->parent->common.classid : TC_H_ROOT;
1069	tcm->tcm_handle = cl->common.classid;
1070	if (!cl->level && cl->un.leaf.q)
1071		tcm->tcm_info = cl->un.leaf.q->handle;
1072
1073	nest = nla_nest_start(skb, TCA_OPTIONS);
1074	if (nest == NULL)
1075		goto nla_put_failure;
1076
1077	memset(&opt, 0, sizeof(opt));
1078
1079	opt.rate = cl->rate->rate;
1080	opt.buffer = cl->buffer;
1081	opt.ceil = cl->ceil->rate;
1082	opt.cbuffer = cl->cbuffer;
1083	opt.quantum = cl->quantum;
1084	opt.prio = cl->prio;
1085	opt.level = cl->level;
1086	NLA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt);
1087
1088	nla_nest_end(skb, nest);
1089	spin_unlock_bh(root_lock);
1090	return skb->len;
1091
1092nla_put_failure:
1093	spin_unlock_bh(root_lock);
1094	nla_nest_cancel(skb, nest);
1095	return -1;
1096}
1097
1098static int
1099htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
1100{
1101	struct htb_class *cl = (struct htb_class *)arg;
1102
1103	if (!cl->level && cl->un.leaf.q)
1104		cl->qstats.qlen = cl->un.leaf.q->q.qlen;
1105	cl->xstats.tokens = cl->tokens;
1106	cl->xstats.ctokens = cl->ctokens;
1107
1108	if (gnet_stats_copy_basic(d, &cl->bstats) < 0 ||
1109	    gnet_stats_copy_rate_est(d, NULL, &cl->rate_est) < 0 ||
1110	    gnet_stats_copy_queue(d, &cl->qstats) < 0)
1111		return -1;
1112
1113	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1114}
1115
1116static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1117		     struct Qdisc **old)
1118{
1119	struct htb_class *cl = (struct htb_class *)arg;
1120
1121	if (cl->level)
1122		return -EINVAL;
1123	if (new == NULL &&
1124	    (new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1125				     &pfifo_qdisc_ops,
1126				     cl->common.classid)) == NULL)
1127		return -ENOBUFS;
1128
1129	sch_tree_lock(sch);
1130	*old = cl->un.leaf.q;
1131	cl->un.leaf.q = new;
1132	if (*old != NULL) {
1133		qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
1134		qdisc_reset(*old);
1135	}
1136	sch_tree_unlock(sch);
1137	return 0;
1138}
1139
1140static struct Qdisc *htb_leaf(struct Qdisc *sch, unsigned long arg)
1141{
1142	struct htb_class *cl = (struct htb_class *)arg;
1143	return !cl->level ? cl->un.leaf.q : NULL;
1144}
1145
1146static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg)
1147{
1148	struct htb_class *cl = (struct htb_class *)arg;
1149
1150	if (cl->un.leaf.q->q.qlen == 0)
1151		htb_deactivate(qdisc_priv(sch), cl);
1152}
1153
1154static unsigned long htb_get(struct Qdisc *sch, u32 classid)
1155{
1156	struct htb_class *cl = htb_find(classid, sch);
1157	if (cl)
1158		cl->refcnt++;
1159	return (unsigned long)cl;
1160}
1161
1162static inline int htb_parent_last_child(struct htb_class *cl)
1163{
1164	if (!cl->parent)
1165		/* the root class */
1166		return 0;
1167	if (cl->parent->children > 1)
1168		/* not the last child */
1169		return 0;
1170	return 1;
1171}
1172
1173static void htb_parent_to_leaf(struct htb_sched *q, struct htb_class *cl,
1174			       struct Qdisc *new_q)
1175{
1176	struct htb_class *parent = cl->parent;
1177
1178	WARN_ON(cl->level || !cl->un.leaf.q || cl->prio_activity);
1179
1180	if (parent->cmode != HTB_CAN_SEND)
1181		htb_safe_rb_erase(&parent->pq_node, q->wait_pq + parent->level);
1182
1183	parent->level = 0;
1184	memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1185	INIT_LIST_HEAD(&parent->un.leaf.drop_list);
1186	parent->un.leaf.q = new_q ? new_q : &noop_qdisc;
1187	parent->tokens = parent->buffer;
1188	parent->ctokens = parent->cbuffer;
1189	parent->t_c = psched_get_time();
1190	parent->cmode = HTB_CAN_SEND;
1191}
1192
1193static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
1194{
1195	if (!cl->level) {
1196		WARN_ON(!cl->un.leaf.q);
1197		qdisc_destroy(cl->un.leaf.q);
1198	}
1199	gen_kill_estimator(&cl->bstats, &cl->rate_est);
1200	qdisc_put_rtab(cl->rate);
1201	qdisc_put_rtab(cl->ceil);
1202
1203	tcf_destroy_chain(&cl->filter_list);
1204	kfree(cl);
1205}
1206
1207static void htb_destroy(struct Qdisc *sch)
1208{
1209	struct htb_sched *q = qdisc_priv(sch);
1210	struct hlist_node *n, *next;
1211	struct htb_class *cl;
1212	unsigned int i;
1213
1214	cancel_work_sync(&q->work);
1215	qdisc_watchdog_cancel(&q->watchdog);
1216	/* This line used to be after htb_destroy_class call below
1217	   and surprisingly it worked in 2.4. But it must precede it
1218	   because filter need its target class alive to be able to call
1219	   unbind_filter on it (without Oops). */
1220	tcf_destroy_chain(&q->filter_list);
1221
1222	for (i = 0; i < q->clhash.hashsize; i++) {
1223		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode)
1224			tcf_destroy_chain(&cl->filter_list);
1225	}
1226	for (i = 0; i < q->clhash.hashsize; i++) {
1227		hlist_for_each_entry_safe(cl, n, next, &q->clhash.hash[i],
1228					  common.hnode)
1229			htb_destroy_class(sch, cl);
1230	}
1231	qdisc_class_hash_destroy(&q->clhash);
1232	__skb_queue_purge(&q->direct_queue);
1233}
1234
1235static int htb_delete(struct Qdisc *sch, unsigned long arg)
1236{
1237	struct htb_sched *q = qdisc_priv(sch);
1238	struct htb_class *cl = (struct htb_class *)arg;
1239	unsigned int qlen;
1240	struct Qdisc *new_q = NULL;
1241	int last_child = 0;
1242
1243	// TODO: why don't allow to delete subtree ? references ? does
1244	// tc subsys quarantee us that in htb_destroy it holds no class
1245	// refs so that we can remove children safely there ?
1246	if (cl->children || cl->filter_cnt)
1247		return -EBUSY;
1248
1249	if (!cl->level && htb_parent_last_child(cl)) {
1250		new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1251					  &pfifo_qdisc_ops,
1252					  cl->parent->common.classid);
1253		last_child = 1;
1254	}
1255
1256	sch_tree_lock(sch);
1257
1258	if (!cl->level) {
1259		qlen = cl->un.leaf.q->q.qlen;
1260		qdisc_reset(cl->un.leaf.q);
1261		qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
1262	}
1263
1264	/* delete from hash and active; remainder in destroy_class */
1265	qdisc_class_hash_remove(&q->clhash, &cl->common);
1266	if (cl->parent)
1267		cl->parent->children--;
1268
1269	if (cl->prio_activity)
1270		htb_deactivate(q, cl);
1271
1272	if (cl->cmode != HTB_CAN_SEND)
1273		htb_safe_rb_erase(&cl->pq_node, q->wait_pq + cl->level);
1274
1275	if (last_child)
1276		htb_parent_to_leaf(q, cl, new_q);
1277
1278	BUG_ON(--cl->refcnt == 0);
1279	/*
1280	 * This shouldn't happen: we "hold" one cops->get() when called
1281	 * from tc_ctl_tclass; the destroy method is done from cops->put().
1282	 */
1283
1284	sch_tree_unlock(sch);
1285	return 0;
1286}
1287
1288static void htb_put(struct Qdisc *sch, unsigned long arg)
1289{
1290	struct htb_class *cl = (struct htb_class *)arg;
1291
1292	if (--cl->refcnt == 0)
1293		htb_destroy_class(sch, cl);
1294}
1295
1296static int htb_change_class(struct Qdisc *sch, u32 classid,
1297			    u32 parentid, struct nlattr **tca,
1298			    unsigned long *arg)
1299{
1300	int err = -EINVAL;
1301	struct htb_sched *q = qdisc_priv(sch);
1302	struct htb_class *cl = (struct htb_class *)*arg, *parent;
1303	struct nlattr *opt = tca[TCA_OPTIONS];
1304	struct qdisc_rate_table *rtab = NULL, *ctab = NULL;
1305	struct nlattr *tb[TCA_HTB_RTAB + 1];
1306	struct tc_htb_opt *hopt;
1307
1308	/* extract all subattrs from opt attr */
1309	if (!opt)
1310		goto failure;
1311
1312	err = nla_parse_nested(tb, TCA_HTB_RTAB, opt, htb_policy);
1313	if (err < 0)
1314		goto failure;
1315
1316	err = -EINVAL;
1317	if (tb[TCA_HTB_PARMS] == NULL)
1318		goto failure;
1319
1320	parent = parentid == TC_H_ROOT ? NULL : htb_find(parentid, sch);
1321
1322	hopt = nla_data(tb[TCA_HTB_PARMS]);
1323
1324	rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB]);
1325	ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB]);
1326	if (!rtab || !ctab)
1327		goto failure;
1328
1329	if (!cl) {		/* new class */
1330		struct Qdisc *new_q;
1331		int prio;
1332		struct {
1333			struct nlattr		nla;
1334			struct gnet_estimator	opt;
1335		} est = {
1336			.nla = {
1337				.nla_len	= nla_attr_size(sizeof(est.opt)),
1338				.nla_type	= TCA_RATE,
1339			},
1340			.opt = {
1341				/* 4s interval, 16s averaging constant */
1342				.interval	= 2,
1343				.ewma_log	= 2,
1344			},
1345		};
1346
1347		/* check for valid classid */
1348		if (!classid || TC_H_MAJ(classid ^ sch->handle) ||
1349		    htb_find(classid, sch))
1350			goto failure;
1351
1352		/* check maximal depth */
1353		if (parent && parent->parent && parent->parent->level < 2) {
1354			printk(KERN_ERR "htb: tree is too deep\n");
1355			goto failure;
1356		}
1357		err = -ENOBUFS;
1358		if ((cl = kzalloc(sizeof(*cl), GFP_KERNEL)) == NULL)
1359			goto failure;
1360
1361		err = gen_new_estimator(&cl->bstats, &cl->rate_est,
1362					qdisc_root_sleeping_lock(sch),
1363					tca[TCA_RATE] ? : &est.nla);
1364		if (err) {
1365			kfree(cl);
1366			goto failure;
1367		}
1368
1369		cl->refcnt = 1;
1370		cl->children = 0;
1371		INIT_LIST_HEAD(&cl->un.leaf.drop_list);
1372		RB_CLEAR_NODE(&cl->pq_node);
1373
1374		for (prio = 0; prio < TC_HTB_NUMPRIO; prio++)
1375			RB_CLEAR_NODE(&cl->node[prio]);
1376
1377		/* create leaf qdisc early because it uses kmalloc(GFP_KERNEL)
1378		   so that can't be used inside of sch_tree_lock
1379		   -- thanks to Karlis Peisenieks */
1380		new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue,
1381					  &pfifo_qdisc_ops, classid);
1382		sch_tree_lock(sch);
1383		if (parent && !parent->level) {
1384			unsigned int qlen = parent->un.leaf.q->q.qlen;
1385
1386			/* turn parent into inner node */
1387			qdisc_reset(parent->un.leaf.q);
1388			qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
1389			qdisc_destroy(parent->un.leaf.q);
1390			if (parent->prio_activity)
1391				htb_deactivate(q, parent);
1392
1393			/* remove from evt list because of level change */
1394			if (parent->cmode != HTB_CAN_SEND) {
1395				htb_safe_rb_erase(&parent->pq_node, q->wait_pq);
1396				parent->cmode = HTB_CAN_SEND;
1397			}
1398			parent->level = (parent->parent ? parent->parent->level
1399					 : TC_HTB_MAXDEPTH) - 1;
1400			memset(&parent->un.inner, 0, sizeof(parent->un.inner));
1401		}
1402		/* leaf (we) needs elementary qdisc */
1403		cl->un.leaf.q = new_q ? new_q : &noop_qdisc;
1404
1405		cl->common.classid = classid;
1406		cl->parent = parent;
1407
1408		/* set class to be in HTB_CAN_SEND state */
1409		cl->tokens = hopt->buffer;
1410		cl->ctokens = hopt->cbuffer;
1411		cl->mbuffer = 60 * PSCHED_TICKS_PER_SEC;	/* 1min */
1412		cl->t_c = psched_get_time();
1413		cl->cmode = HTB_CAN_SEND;
1414
1415		/* attach to the hash list and parent's family */
1416		qdisc_class_hash_insert(&q->clhash, &cl->common);
1417		if (parent)
1418			parent->children++;
1419	} else {
1420		if (tca[TCA_RATE]) {
1421			err = gen_replace_estimator(&cl->bstats, &cl->rate_est,
1422						    qdisc_root_sleeping_lock(sch),
1423						    tca[TCA_RATE]);
1424			if (err)
1425				return err;
1426		}
1427		sch_tree_lock(sch);
1428	}
1429
1430	/* it used to be a nasty bug here, we have to check that node
1431	   is really leaf before changing cl->un.leaf ! */
1432	if (!cl->level) {
1433		cl->quantum = rtab->rate.rate / q->rate2quantum;
1434		if (!hopt->quantum && cl->quantum < 1000) {
1435			printk(KERN_WARNING
1436			       "HTB: quantum of class %X is small. Consider r2q change.\n",
1437			       cl->common.classid);
1438			cl->quantum = 1000;
1439		}
1440		if (!hopt->quantum && cl->quantum > 200000) {
1441			printk(KERN_WARNING
1442			       "HTB: quantum of class %X is big. Consider r2q change.\n",
1443			       cl->common.classid);
1444			cl->quantum = 200000;
1445		}
1446		if (hopt->quantum)
1447			cl->quantum = hopt->quantum;
1448		if ((cl->prio = hopt->prio) >= TC_HTB_NUMPRIO)
1449			cl->prio = TC_HTB_NUMPRIO - 1;
1450	}
1451
1452	cl->buffer = hopt->buffer;
1453	cl->cbuffer = hopt->cbuffer;
1454	if (cl->rate)
1455		qdisc_put_rtab(cl->rate);
1456	cl->rate = rtab;
1457	if (cl->ceil)
1458		qdisc_put_rtab(cl->ceil);
1459	cl->ceil = ctab;
1460	sch_tree_unlock(sch);
1461
1462	qdisc_class_hash_grow(sch, &q->clhash);
1463
1464	*arg = (unsigned long)cl;
1465	return 0;
1466
1467failure:
1468	if (rtab)
1469		qdisc_put_rtab(rtab);
1470	if (ctab)
1471		qdisc_put_rtab(ctab);
1472	return err;
1473}
1474
1475static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg)
1476{
1477	struct htb_sched *q = qdisc_priv(sch);
1478	struct htb_class *cl = (struct htb_class *)arg;
1479	struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list;
1480
1481	return fl;
1482}
1483
1484static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent,
1485				     u32 classid)
1486{
1487	struct htb_class *cl = htb_find(classid, sch);
1488
1489	/*if (cl && !cl->level) return 0;
1490	   The line above used to be there to prevent attaching filters to
1491	   leaves. But at least tc_index filter uses this just to get class
1492	   for other reasons so that we have to allow for it.
1493	   ----
1494	   19.6.2002 As Werner explained it is ok - bind filter is just
1495	   another way to "lock" the class - unlike "get" this lock can
1496	   be broken by class during destroy IIUC.
1497	 */
1498	if (cl)
1499		cl->filter_cnt++;
1500	return (unsigned long)cl;
1501}
1502
1503static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg)
1504{
1505	struct htb_class *cl = (struct htb_class *)arg;
1506
1507	if (cl)
1508		cl->filter_cnt--;
1509}
1510
1511static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1512{
1513	struct htb_sched *q = qdisc_priv(sch);
1514	struct htb_class *cl;
1515	struct hlist_node *n;
1516	unsigned int i;
1517
1518	if (arg->stop)
1519		return;
1520
1521	for (i = 0; i < q->clhash.hashsize; i++) {
1522		hlist_for_each_entry(cl, n, &q->clhash.hash[i], common.hnode) {
1523			if (arg->count < arg->skip) {
1524				arg->count++;
1525				continue;
1526			}
1527			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1528				arg->stop = 1;
1529				return;
1530			}
1531			arg->count++;
1532		}
1533	}
1534}
1535
1536static const struct Qdisc_class_ops htb_class_ops = {
1537	.graft		=	htb_graft,
1538	.leaf		=	htb_leaf,
1539	.qlen_notify	=	htb_qlen_notify,
1540	.get		=	htb_get,
1541	.put		=	htb_put,
1542	.change		=	htb_change_class,
1543	.delete		=	htb_delete,
1544	.walk		=	htb_walk,
1545	.tcf_chain	=	htb_find_tcf,
1546	.bind_tcf	=	htb_bind_filter,
1547	.unbind_tcf	=	htb_unbind_filter,
1548	.dump		=	htb_dump_class,
1549	.dump_stats	=	htb_dump_class_stats,
1550};
1551
1552static struct Qdisc_ops htb_qdisc_ops __read_mostly = {
1553	.cl_ops		=	&htb_class_ops,
1554	.id		=	"htb",
1555	.priv_size	=	sizeof(struct htb_sched),
1556	.enqueue	=	htb_enqueue,
1557	.dequeue	=	htb_dequeue,
1558	.peek		=	qdisc_peek_dequeued,
1559	.drop		=	htb_drop,
1560	.init		=	htb_init,
1561	.reset		=	htb_reset,
1562	.destroy	=	htb_destroy,
1563	.dump		=	htb_dump,
1564	.owner		=	THIS_MODULE,
1565};
1566
1567static int __init htb_module_init(void)
1568{
1569	return register_qdisc(&htb_qdisc_ops);
1570}
1571static void __exit htb_module_exit(void)
1572{
1573	unregister_qdisc(&htb_qdisc_ops);
1574}
1575
1576module_init(htb_module_init)
1577module_exit(htb_module_exit)
1578MODULE_LICENSE("GPL");
1579