/*
 * net/sched/sch_api.c	Packet scheduler API.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *
 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmod.h>
#include <linux/list.h>
#include <linux/hrtimer.h>
#include <linux/lockdep.h>
#include <linux/slab.h>

#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new);
static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event);

/*

   Short review.
   -------------

   This file consists of two interrelated parts:

   1. The queueing discipline ("qdisc") manager frontend.
   2. The traffic class manager frontend.

   Generally, a queueing discipline is a black box that can enqueue
   packets and dequeue them (when the device is ready to send
   something) in an order and at times determined by the algorithm
   hidden inside it.

   qdiscs fall into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all packets into "traffic classes",
     using "packet classifiers" (see cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them, and so on recursively.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles into a
   form more intelligible to the kernel, to make some sanity
   checks and do the part of the work common to all qdiscs,
   and to provide rtnetlink notifications.

   All the real intelligent work is done inside the qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns an skb to send. It is allowed to return NULL,
   but that does not mean the queue is empty; it only means that the
   discipline does not want to send anything at this time.
   The queue is really empty only if q->q.qlen == 0.
   For complicated disciplines with multiple queues, q->q is not the
   real packet queue, but q->q.qlen must still be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code:
   NET_XMIT_DROP 	- this packet was dropped.
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore.
   NET_XMIT_POLICED	- dropped by the policer.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---peek

   like dequeue but without removing a packet from the queue.

   ---reset

   returns the qdisc to its initial state: purges all buffers, clears all
   timers, counters (except for statistics), etc.

   ---init

   initializes a newly created qdisc.

   ---destroy

   destroys the resources allocated by init and during the lifetime of the qdisc.

   ---change

   changes qdisc parameters.
 */
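
/*
 * To make the enqueue/dequeue contract above concrete, here is a
 * minimal, hypothetical FIFO discipline (an illustrative sketch only;
 * "example_fifo" and its hard-coded 128-packet limit are invented for
 * this comment and are not part of this file):
 *
 *	static int example_fifo_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 *	{
 *		if (likely(skb_queue_len(&sch->q) < 128))
 *			return qdisc_enqueue_tail(skb, sch); // NET_XMIT_SUCCESS
 *		return qdisc_reshape_fail(skb, sch);	     // drop, non-zero code
 *	}
 *
 *	static struct sk_buff *example_fifo_dequeue(struct Qdisc *sch)
 *	{
 *		return qdisc_dequeue_head(sch);	// NULL: nothing to send now
 *	}
 */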

/* Protects the list of registered TC modules. It is a pure SMP lock. */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *	Queueing disciplines manipulation.	*
 ************************************************/


/* The list of all installed queueing disciplines. */

static struct Qdisc_ops *qdisc_base;

/* Register/unregister queueing discipline */

int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		if (!(cops->get && cops->put && cops->walk && cops->leaf))
			goto out_einval;

		if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);

int unregister_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int err = -ENOENT;

	write_lock(&qdisc_mod_lock);
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (q == qops)
			break;
	if (q) {
		*qp = q->next;
		q->next = NULL;
		err = 0;
	}
	write_unlock(&qdisc_mod_lock);
	return err;
}
EXPORT_SYMBOL(unregister_qdisc);
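
/*
 * Typical use of the pair above, sketched for a hypothetical module
 * built on the example_fifo routines from the comment further up (an
 * assumption for illustration; a real discipline also fills in .init,
 * .reset, .destroy and friends):
 *
 *	static struct Qdisc_ops example_fifo_qdisc_ops __read_mostly = {
 *		.id		= "example_fifo",
 *		.enqueue	= example_fifo_enqueue,
 *		.dequeue	= example_fifo_dequeue,
 *		.peek		= qdisc_peek_head,
 *		.owner		= THIS_MODULE,
 *	};
 *
 *	static int __init example_fifo_module_init(void)
 *	{
 *		return register_qdisc(&example_fifo_qdisc_ops);
 *	}
 *	static void __exit example_fifo_module_exit(void)
 *	{
 *		unregister_qdisc(&example_fifo_qdisc_ops);
 *	}
 *	module_init(example_fifo_module_init);
 *	module_exit(example_fifo_module_exit);
 *
 * Note that register_qdisc() would reject these ops if .peek were left
 * NULL while .dequeue is set, per the checks above.
 */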

/* We know the handle. Find the qdisc among all qdiscs attached to the
   device (the root qdisc, all its children, children of children, etc.)
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	list_for_each_entry(q, &root->list, list) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}

static void qdisc_list_add(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
}

void qdisc_list_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
		list_del(&q->list);
}
EXPORT_SYMBOL(qdisc_list_del);

struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	q = qdisc_match_from_root(dev->qdisc, handle);
	if (q)
		goto out;

	q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle);
out:
	return q;
}

static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
{
	unsigned long cl;
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = p->ops->cl_ops;

	if (cops == NULL)
		return NULL;
	cl = cops->get(p, classid);

	if (cl == 0)
		return NULL;
	leaf = cops->leaf(p, cl);
	cops->put(p, cl);
	return leaf;
}

/* Find a queueing discipline by name */

static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
{
	struct Qdisc_ops *q = NULL;

	if (kind) {
		read_lock(&qdisc_mod_lock);
		for (q = qdisc_base; q; q = q->next) {
			if (nla_strcmp(kind, q->id) == 0) {
				if (!try_module_get(q->owner))
					q = NULL;
				break;
			}
		}
		read_unlock(&qdisc_mod_lock);
	}
	return q;
}

static struct qdisc_rate_table *qdisc_rtab_list;

struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
			rtab->refcnt++;
			return rtab;
		}
	}

	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), TC_RTAB_SIZE);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);

void qdisc_put_rtab(struct qdisc_rate_table *tab)
{
	struct qdisc_rate_table *rtab, **rtabp;

	if (!tab || --tab->refcnt)
		return;

	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
		if (rtab == tab) {
			*rtabp = rtab->next;
			kfree(rtab);
			return;
		}
	}
}
EXPORT_SYMBOL(qdisc_put_rtab);

static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};

static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	if (!s || tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}

void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		kfree(tab);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);

static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}

void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(qdisc_calculate_pkt_len);
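
/*
 * Worked example for the size-table lookup above (all numbers invented
 * for illustration): with szopts.overhead = 24, cell_align = 0,
 * cell_log = 6, size_log = 0 and tsize = 512, a 100-byte skb gives
 * pkt_len = 124 and slot = 124 >> 6 = 1, so the qdisc is charged
 * stab->data[1] bytes for this packet rather than skb->len.
 */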

void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
{
	if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
		printk(KERN_WARNING
		       "%s: %s qdisc %X: is non-work-conserving?\n",
		       txt, qdisc->ops->id, qdisc->handle >> 16);
		qdisc->flags |= TCQ_F_WARN_NONWC;
	}
}
EXPORT_SYMBOL(qdisc_warn_nonwc);

static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}

void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init);

void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
{
	ktime_t time;

	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	wd->qdisc->flags |= TCQ_F_THROTTLED;
	time = ktime_set(0, 0);
	time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
	hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule);

void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	wd->qdisc->flags &= ~TCQ_F_THROTTLED;
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);

static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head), i;
	struct hlist_head *h;

	if (size <= PAGE_SIZE)
		h = kmalloc(size, GFP_KERNEL);
	else
		h = (struct hlist_head *)
			__get_free_pages(GFP_KERNEL, get_order(size));

	if (h != NULL) {
		for (i = 0; i < n; i++)
			INIT_HLIST_HEAD(&h[i]);
	}
	return h;
}

static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
{
	unsigned int size = n * sizeof(struct hlist_head);

	if (size <= PAGE_SIZE)
		kfree(h);
	else
		free_pages((unsigned long)h, get_order(size));
}

void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *n, *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when the load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);

int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
{
	unsigned int size = 4;

	clhash->hash = qdisc_class_hash_alloc(size);
	if (clhash->hash == NULL)
		return -ENOMEM;
	clhash->hashsize  = size;
	clhash->hashmask  = size - 1;
	clhash->hashelems = 0;
	return 0;
}
EXPORT_SYMBOL(qdisc_class_hash_init);

void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);

void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);

void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);

/* Allocate a unique handle from the space managed by the kernel */

static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x10000;
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
	} while	(qdisc_lookup(dev, autohandle) && --i > 0);

	return i > 0 ? autohandle : 0;
}

void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);

static void notify_and_destroy(struct net *net, struct sk_buff *skb,
			       struct nlmsghdr *n, u32 clid,
			       struct Qdisc *old, struct Qdisc *new)
{
	if (new || old)
		qdisc_notify(net, skb, n, clid, old, new);

	if (old)
		qdisc_destroy(old);
}

/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev".
 *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
 *
 * On success, destroy the old qdisc.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
		}

		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		if (new && new->ops->attach) {
			new->ops->attach(new);
			num_q = 0;
		}

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = &dev->rx_queue;

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}
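
/*
 * Both branches above are reachable from userspace; the commands below
 * are illustrative only, not taken from this file:
 *   "tc qdisc add dev eth0 root handle 1: htb" goes through the
 *   parent == NULL branch and replaces the device root, while
 *   "tc qdisc add dev eth0 parent 1:1 handle 10: pfifo" grafts into a
 *   class via cops->get()/cops->graft().
 */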

/* lockdep annotation is needed for ingress; egress gets it only for name */
static struct lock_class_key qdisc_tx_lock;
static struct lock_class_key qdisc_rx_lock;

/*
   Allocate and initialize a new qdisc.

   Parameters are passed via opt.
 */

static struct Qdisc *
qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
	     struct Qdisc *p, u32 parent, u32 handle,
	     struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will call qdisc_lookup_ops() again
				 * on replay, so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out4;
			}
			sch->stab = stab;
		}
		if (tca[TCA_RATE]) {
			spinlock_t *root_lock;

			err = -EOPNOTSUPP;
			if (sch->flags & TCQ_F_MQROOT)
				goto err_out4;

			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS) &&
			    (!p || !(p->flags & TCQ_F_MQROOT)))
				root_lock = qdisc_root_sleeping_lock(sch);
			else
				root_lock = qdisc_lock(sch);

			err = gen_new_estimator(&sch->bstats, &sch->rate_est,
						root_lock, tca[TCA_RATE]);
			if (err)
				goto err_out4;
		}

		qdisc_list_add(sch);

		return sch;
	}
err_out3:
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(sch->stab);
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}

static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	qdisc_put_stab(sch->stab);
	sch->stab = stab;

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because the change cannot be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
					    qdisc_root_sleeping_lock(sch),
					    tca[TCA_RATE]);
	}
out:
	return 0;
}

struct check_loop_arg
{
	struct qdisc_walker 	w;
	struct Qdisc		*p;
	int			depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg	arg;

	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}

/*
 * Delete/get qdisc.
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}

/*
   Create/change qdisc.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->rx_queue.qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be the default qdisc; ignore it. */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know that some child q is already
				 *   attached to this parent and have a choice:
				 *   either to change it or to create/graft a
				 *   new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if both the CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, the requestor meant that
				 *   a qdisc with handle tcm_handle is not
				 *   expected to exist, so we choose create/graft
				 *   too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of a hole in the API; we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft if the
				 *   user gave a KIND that does not match the
				 *   existing one.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, &dev->rx_queue, p,
				 tcm->tcm_parent, tcm->tcm_parent,
				 tca, &err);
	else {
		struct netdev_queue *dev_queue;

		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}

static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	q->qstats.qlen = q->q.qlen;

	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static bool tc_qdisc_dump_ignore(struct Qdisc *q)
{
	return (q->flags & TCQ_F_BUILTIN) ? true : false;
}

static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old)) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new)) {
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}

static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}

static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	rcu_read_lock();
	idx = 0;
	for_each_netdev_rcu(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = &dev->rx_queue;
		if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	rcu_read_unlock();

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}



/************************************************
 *	Traffic classes manipulation.		*
 ************************************************/



static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is the root class.
	   parent == X:Y	 - parent is a node in the hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is the qdisc.

	   handle == 0:0	 - generate a handle from the kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is the qdisc.
	   handle == X:Y	 - self-explanatory.
	   handle == X:0	 - root class.
	 */
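
	/*
	 * Worked example (values illustrative, not from this file):
	 * "tc class add ... parent 1:1 classid 1:10" arrives with
	 * tcm_parent = 0x00010001 and tcm_handle = 0x00010010, so
	 * TC_H_MAJ(tcm_handle) = 0x00010000 names the qdisc 1:0 and
	 * TC_H_MIN(tcm_handle) = 0x10 is the minor within it.
	 */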

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is a genuine qdisc handle consistent with
		   both parent and child.

		   TC_H_MAJ(pid) may still be unspecified; complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate the qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get the class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}


static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}

struct qdisc_dump_args
{
	struct qdisc_walker w;
	struct sk_buff *skb;
	struct netlink_callback *cb;
};

static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
{
	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;

	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
}

static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}

static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t)
{
	struct Qdisc *q;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	list_for_each_entry(q, &root->list, list) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}

static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg *)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = &dev->rx_queue;
	if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}

/* Main classifier routine: scans the classifier chain attached
   to this qdisc, (optionally) tests for the protocol, and asks
   the specific classifiers.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err = 0;

	for (; tp; tp = tp->next) {
		if ((tp->protocol == protocol ||
		     tp->protocol == htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);

int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
		struct tcf_result *res)
{
	int err = 0;
	__be16 protocol;
#ifdef CONFIG_NET_CLS_ACT
	struct tcf_proto *otp = tp;
reclassify:
#endif
	protocol = skb->protocol;

	err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
	if (err == TC_ACT_RECLASSIFY) {
		u32 verd = G_TC_VERD(skb->tc_verd);
		tp = otp;

		if (verd++ >= MAX_REC_LOOP) {
			if (net_ratelimit())
				printk(KERN_NOTICE
				       "%s: packet reclassify loop"
					  " rule prio %u protocol %02x\n",
				       tp->q->ops->id,
				       tp->prio & 0xffff, ntohs(tp->protocol));
			return TC_ACT_SHOT;
		}
		skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
		goto reclassify;
	}
#endif
	return err;
}
EXPORT_SYMBOL(tc_classify);

void tcf_destroy(struct tcf_proto *tp)
{
	tp->ops->destroy(tp);
	module_put(tp->ops->owner);
	kfree(tp);
}

void tcf_destroy_chain(struct tcf_proto **fl)
{
	struct tcf_proto *tp;

	while ((tp = *fl) != NULL) {
		*fl = tp->next;
		tcf_destroy(tp);
	}
}
EXPORT_SYMBOL(tcf_destroy_chain);

#ifdef CONFIG_PROC_FS
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, NULL);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_net_fops_create(net, "psched", 0, &psched_fops);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	proc_net_remove(net, "psched");
}
#else
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif

static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};

static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		printk(KERN_ERR "pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}

subsys_initcall(pktsched_init);