1/*
2 * net/sched/sch_api.c	Packet scheduler API.
3 *
4 *		This program is free software; you can redistribute it and/or
5 *		modify it under the terms of the GNU General Public License
6 *		as published by the Free Software Foundation; either version
7 *		2 of the License, or (at your option) any later version.
8 *
9 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
18#include <linux/config.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
21#include <linux/sched.h>
22#include <linux/string.h>
23#include <linux/mm.h>
24#include <linux/socket.h>
25#include <linux/sockios.h>
26#include <linux/in.h>
27#include <linux/errno.h>
28#include <linux/interrupt.h>
29#include <linux/netdevice.h>
30#include <linux/skbuff.h>
31#include <linux/rtnetlink.h>
32#include <linux/init.h>
33#include <linux/proc_fs.h>
34#include <linux/kmod.h>
35
36#include <net/sock.h>
37#include <net/pkt_sched.h>
38
39#include <asm/processor.h>
40#include <asm/uaccess.h>
41#include <asm/system.h>
42#include <asm/bitops.h>
43
44static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
45			struct Qdisc *old, struct Qdisc *new);
46static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
47			 struct Qdisc *q, unsigned long cl, int event);
48
49/*
50
51   Short review.
52   -------------
53
54   This file consists of two interrelated parts:
55
56   1. queueing disciplines manager frontend.
57   2. traffic classes manager frontend.
58
59   Generally, queueing discipline ("qdisc") is a black box,
60   which is able to enqueue packets and to dequeue them (when
61   device is ready to send something) in order and at times
62   determined by algorithm hidden in it.
63
   qdiscs are divided into two categories:
65   - "queues", which have no internal structure visible from outside.
66   - "schedulers", which split all the packets to "traffic classes",
67     using "packet classifiers" (look at cls_api.c)
68
   In turn, classes may have child qdiscs (as a rule, queues)
70   attached to them etc. etc. etc.
71
   The goal of the routines in this file is to translate
   the handle-based information supplied by the user into a form
   more intelligible to the kernel, to perform sanity checks and
   the parts of the work that are common to all qdiscs,
   and to provide rtnetlink notifications.
77
78   All real intelligent work is done inside qdisc modules.
79
80
81
82   Every discipline has two major routines: enqueue and dequeue.
83
84   ---dequeue
85
86   dequeue usually returns a skb to send. It is allowed to return NULL,
87   but it does not mean that queue is empty, it just means that
88   discipline does not want to send anything this time.
89   Queue is really empty if q->q.qlen == 0.
90   For complicated disciplines with multiple queues q->q is not
91   real packet queue, but however q->q.qlen must be valid.
92
93   ---enqueue
94
95   enqueue returns 0, if packet was enqueued successfully.
96   If packet (this one or another one) was dropped, it returns
97   not zero error code.
98   NET_XMIT_DROP 	- this packet dropped
99     Expected action: do not backoff, but wait until queue will clear.
100   NET_XMIT_CN	 	- probably this packet enqueued, but another one dropped.
101     Expected action: backoff or ignore
102   NET_XMIT_POLICED	- dropped by police.
103     Expected action: backoff or error to real-time apps.
104
105   Auxiliary routines:
106
107   ---requeue
108
109   requeues once dequeued packet. It is used for non-standard or
110   just buggy devices, which can defer output even if dev->tbusy=0.
111
112   ---reset
113
114   returns qdisc to initial state: purge all buffers, clear all
115   timers, counters (except for statistics) etc.
116
117   ---init
118
119   initializes newly created qdisc.
120
121   ---destroy
122
123   destroys resources allocated by init and during lifetime of qdisc.
124
125   ---change
126
127   changes qdisc parameters.
128 */
129
130/* Protects list of registered TC modules. It is pure SMP lock. */
131static rwlock_t qdisc_mod_lock = RW_LOCK_UNLOCKED;
132
133
134/************************************************
135 *	Queueing disciplines manipulation.	*
136 ************************************************/
137
138
139/* The list of all installed queueing disciplines. */
140
141static struct Qdisc_ops *qdisc_base = NULL;
142
/* Register/unregister a queueing discipline. */
144
145int register_qdisc(struct Qdisc_ops *qops)
146{
147	struct Qdisc_ops *q, **qp;
148
149	write_lock(&qdisc_mod_lock);
150	for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) {
151		if (strcmp(qops->id, q->id) == 0) {
152			write_unlock(&qdisc_mod_lock);
153			return -EEXIST;
154		}
155	}
156
157	if (qops->enqueue == NULL)
158		qops->enqueue = noop_qdisc_ops.enqueue;
159	if (qops->requeue == NULL)
160		qops->requeue = noop_qdisc_ops.requeue;
161	if (qops->dequeue == NULL)
162		qops->dequeue = noop_qdisc_ops.dequeue;
163
164	qops->next = NULL;
165	*qp = qops;
166	write_unlock(&qdisc_mod_lock);
167	return 0;
168}
169
170int unregister_qdisc(struct Qdisc_ops *qops)
171{
172	struct Qdisc_ops *q, **qp;
173	int err = -ENOENT;
174
175	write_lock(&qdisc_mod_lock);
176	for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
177		if (q == qops)
178			break;
179	if (q) {
180		*qp = q->next;
181		q->next = NULL;
182		err = 0;
183	}
184	write_unlock(&qdisc_mod_lock);
185	return err;
186}
187
188/* We know handle. Find qdisc among all qdisc's attached to device
189   (root qdisc, all its children, children of children etc.)
190 */
191
192struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
193{
194	struct Qdisc *q;
195
196	for (q = dev->qdisc_list; q; q = q->next) {
197		if (q->handle == handle)
198			return q;
199	}
200	return NULL;
201}
202
203struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
204{
205	unsigned long cl;
206	struct Qdisc *leaf;
207	struct Qdisc_class_ops *cops = p->ops->cl_ops;
208
209	if (cops == NULL)
210		return NULL;
211	cl = cops->get(p, classid);
212
213	if (cl == 0)
214		return NULL;
215	leaf = cops->leaf(p, cl);
216	cops->put(p, cl);
217	return leaf;
218}
219
220/* Find queueing discipline by name */
221
222struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
223{
224	struct Qdisc_ops *q = NULL;
225
226	if (kind) {
227		read_lock(&qdisc_mod_lock);
228		for (q = qdisc_base; q; q = q->next) {
229			if (rtattr_strcmp(kind, q->id) == 0)
230				break;
231		}
232		read_unlock(&qdisc_mod_lock);
233	}
234	return q;
235}
236
237static struct qdisc_rate_table *qdisc_rtab_list;
238
239struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
240{
241	struct qdisc_rate_table *rtab;
242
243	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
244		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
245			rtab->refcnt++;
246			return rtab;
247		}
248	}
249
250	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
251		return NULL;
252
253	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
254	if (rtab) {
255		rtab->rate = *r;
256		rtab->refcnt = 1;
257		memcpy(rtab->data, RTA_DATA(tab), 1024);
258		rtab->next = qdisc_rtab_list;
259		qdisc_rtab_list = rtab;
260	}
261	return rtab;
262}
263
264void qdisc_put_rtab(struct qdisc_rate_table *tab)
265{
266	struct qdisc_rate_table *rtab, **rtabp;
267
268	if (!tab || --tab->refcnt)
269		return;
270
271	for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
272		if (rtab == tab) {
273			*rtabp = rtab->next;
274			kfree(rtab);
275			return;
276		}
277	}
278}
279
280
281/* Allocate an unique handle from space managed by kernel */
282
283u32 qdisc_alloc_handle(struct net_device *dev)
284{
285	int i = 0x10000;
286	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
287
288	do {
289		autohandle += TC_H_MAKE(0x10000U, 0);
290		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
291			autohandle = TC_H_MAKE(0x80000000U, 0);
292	} while	(qdisc_lookup(dev, autohandle) && --i > 0);
293
294	return i>0 ? autohandle : 0;
295}
296
297/* Attach toplevel qdisc to device dev */
298
/* Attach qdisc as the toplevel (or ingress) discipline of dev and
 * return the one it replaced.  The device is quiesced around the
 * switch: deactivated if up, and both qdisc_tree_lock and
 * dev->queue_lock are held while the pointers are swapped.
 */
static struct Qdisc *
dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc;

	if (dev->flags & IFF_UP)
		dev_deactivate(dev);

	write_lock(&qdisc_tree_lock);
	spin_lock_bh(&dev->queue_lock);
	if (qdisc && qdisc->flags&TCQ_F_INGRES) {
		oqdisc = dev->qdisc_ingress;
		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
			/* delete: last reference, reset and detach it */
			qdisc_reset(oqdisc);
			dev->qdisc_ingress = NULL;
		} else {  /* new */
			dev->qdisc_ingress = qdisc;
		}

	} else {

		oqdisc = dev->qdisc_sleeping;

		/* Prune old scheduler */
		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
			qdisc_reset(oqdisc);

		/* ... and graft new one */
		if (qdisc == NULL)
			qdisc = &noop_qdisc;
		/* dev->qdisc stays noop here; dev_activate() below installs
		 * the sleeping qdisc on a running device. */
		dev->qdisc_sleeping = qdisc;
		dev->qdisc = &noop_qdisc;
	}

	spin_unlock_bh(&dev->queue_lock);
	write_unlock(&qdisc_tree_lock);

	if (dev->flags & IFF_UP)
		dev_activate(dev);

	return oqdisc;
}
343
344
345/* Graft qdisc "new" to class "classid" of qdisc "parent" or
346   to device "dev".
347
348   Old qdisc is not destroyed but returned in *old.
349 */
350
/* Graft qdisc "new" either at device level (parent == NULL) or into
 * class "classid" of qdisc "parent" via its class ops.  The displaced
 * qdisc, if any, is returned through *old; it is NOT destroyed here.
 * Returns 0 or a negative errno.
 */
int qdisc_graft(struct net_device *dev, struct Qdisc *parent, u32 classid,
		struct Qdisc *new, struct Qdisc **old)
{
	int err = 0;
	struct Qdisc *q = *old;


	if (parent == NULL) {
		/* NOTE(review): when the OLD qdisc is an ingress qdisc, the
		 * old one (not "new") is passed to dev_graft_qdisc --
		 * dev_graft_qdisc handles ingress replacement internally,
		 * but confirm this asymmetry is intended. */
		if (q && q->flags&TCQ_F_INGRES) {
			*old = dev_graft_qdisc(dev, q);
		} else {
			*old = dev_graft_qdisc(dev, new);
		}
	} else {
		/* Delegate to the parent's class ops: look the class up,
		 * graft under it, and drop the class reference again. */
		struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EINVAL;

		if (cops) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, old);
				cops->put(parent, cl);
			}
		}
	}
	return err;
}
379
380/*
381   Allocate and initialize new qdisc.
382
383   Parameters are passed via opt.
384 */
385
386static struct Qdisc *
387qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
388{
389	int err;
390	struct rtattr *kind = tca[TCA_KIND-1];
391	struct Qdisc *sch = NULL;
392	struct Qdisc_ops *ops;
393	int size;
394
395	ops = qdisc_lookup_ops(kind);
396#ifdef CONFIG_KMOD
397	if (ops==NULL && tca[TCA_KIND-1] != NULL) {
398		char module_name[4 + IFNAMSIZ + 1];
399
400		if (RTA_PAYLOAD(kind) <= IFNAMSIZ) {
401			sprintf(module_name, "sch_%s", (char*)RTA_DATA(kind));
402			request_module (module_name);
403			ops = qdisc_lookup_ops(kind);
404		}
405	}
406#endif
407
408	err = -EINVAL;
409	if (ops == NULL)
410		goto err_out;
411
412	size = sizeof(*sch) + ops->priv_size;
413
414	sch = kmalloc(size, GFP_KERNEL);
415	err = -ENOBUFS;
416	if (!sch)
417		goto err_out;
418
419	/* Grrr... Resolve race condition with module unload */
420
421	err = -EINVAL;
422	if (ops != qdisc_lookup_ops(kind))
423		goto err_out;
424
425	memset(sch, 0, size);
426
427	skb_queue_head_init(&sch->q);
428
429	if (handle == TC_H_INGRESS)
430		sch->flags |= TCQ_F_INGRES;
431
432	sch->ops = ops;
433	sch->enqueue = ops->enqueue;
434	sch->dequeue = ops->dequeue;
435	sch->dev = dev;
436	atomic_set(&sch->refcnt, 1);
437	sch->stats.lock = &dev->queue_lock;
438	if (handle == 0) {
439		handle = qdisc_alloc_handle(dev);
440		err = -ENOMEM;
441		if (handle == 0)
442			goto err_out;
443	}
444
445	if (handle == TC_H_INGRESS)
446                sch->handle =TC_H_MAKE(TC_H_INGRESS, 0);
447        else
448                sch->handle = handle;
449
450	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
451		write_lock(&qdisc_tree_lock);
452		sch->next = dev->qdisc_list;
453		dev->qdisc_list = sch;
454		write_unlock(&qdisc_tree_lock);
455#ifdef CONFIG_NET_ESTIMATOR
456		if (tca[TCA_RATE-1])
457			qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
458#endif
459		return sch;
460	}
461
462err_out:
463	*errp = err;
464	if (sch)
465		kfree(sch);
466	return NULL;
467}
468
469static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
470{
471	if (tca[TCA_OPTIONS-1]) {
472		int err;
473
474		if (sch->ops->change == NULL)
475			return -EINVAL;
476		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
477		if (err)
478			return err;
479	}
480#ifdef CONFIG_NET_ESTIMATOR
481	if (tca[TCA_RATE-1]) {
482		qdisc_kill_estimator(&sch->stats);
483		qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]);
484	}
485#endif
486	return 0;
487}
488
/* Walker state for check_loop(): detects whether qdisc p occurs below
 * qdisc q, which would create a cycle in the qdisc tree. */
struct check_loop_arg
{
	struct qdisc_walker 	w;	/* must be first: cast back in check_loop_fn() */
	struct Qdisc		*p;	/* the qdisc being grafted */
	int			depth;	/* current recursion depth (capped at 7) */
};
495
496static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
497
498static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
499{
500	struct check_loop_arg	arg;
501
502	if (q->ops->cl_ops == NULL)
503		return 0;
504
505	arg.w.stop = arg.w.skip = arg.w.count = 0;
506	arg.w.fn = check_loop_fn;
507	arg.depth = depth;
508	arg.p = p;
509	q->ops->cl_ops->walk(q, &arg.w);
510	return arg.w.stop ? -ELOOP : 0;
511}
512
513static int
514check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
515{
516	struct Qdisc *leaf;
517	struct Qdisc_class_ops *cops = q->ops->cl_ops;
518	struct check_loop_arg *arg = (struct check_loop_arg *)w;
519
520	leaf = cops->leaf(q, cl);
521	if (leaf) {
522		if (leaf == arg->p || arg->depth > 7)
523			return -ELOOP;
524		return check_loop(leaf, arg->p, arg->depth + 1);
525	}
526	return 0;
527}
528
529/*
530 * Delete/get qdisc.
531 */
532
/*
 * RTM_DELQDISC/RTM_GETQDISC handler: locate the qdisc named by
 * tcm_parent/tcm_handle and either delete it (graft NULL in its
 * place and destroy it) or just report it via qdisc_notify().
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;	/* parent qdisc, when clid names a class */
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				/* clid names a class; resolve its leaf qdisc. */
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				q = dev->qdisc_ingress;
                        }
		} else {
			q = dev->qdisc_sleeping;
		}
		if (!q)
			return -ENOENT;

		/* If a handle was also supplied, it must match what we found. */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	/* Optional KIND attribute must match the qdisc's ops id. */
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		if (q->handle == 0)
			return -ENOENT;
		/* Unlink by grafting NULL; qdisc_graft returns the old one in q. */
		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
			return err;
		if (q) {
			qdisc_notify(skb, n, clid, q, NULL);
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(q);
			spin_unlock_bh(&dev->queue_lock);
		}
	} else {
		qdisc_notify(skb, n, clid, NULL, q);
	}
	return 0;
}
589
590/*
591   Create/change qdisc.
592 */
593
/*
 * RTM_NEWQDISC handler: depending on tcm_handle and the
 * NLM_F_CREATE/REPLACE/EXCL flags, create a new qdisc and graft it
 * into place, re-graft an existing one, or change its parameters.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;	/* parent qdisc, when clid names a class */
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	if (clid) {
		/* Resolve the qdisc currently attached at the target point. */
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /*ingress */
				q = dev->qdisc_ingress;
			}
		} else {
			q = dev->qdisc_sleeping;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
					return -EINVAL;
				/* Re-grafting an existing qdisc: make sure it
				 * would not become its own ancestor. */
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND-1] &&
				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	/* The ingress qdisc is created using the parent id as its handle. */
	if (clid == TC_H_INGRESS)
		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
        else
		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
	if (q == NULL)
		return err;

graft:
	if (1) {
		struct Qdisc *old_q = NULL;
		err = qdisc_graft(dev, p, clid, q, &old_q);
		if (err) {
			/* Graft failed: dispose of the qdisc we created or
			 * the extra reference we took above. */
			if (q) {
				spin_lock_bh(&dev->queue_lock);
				qdisc_destroy(q);
				spin_unlock_bh(&dev->queue_lock);
			}
			return err;
		}
		/* Notify and dispose of the displaced qdisc, if any. */
		qdisc_notify(skb, n, clid, old_q, q);
		if (old_q) {
			spin_lock_bh(&dev->queue_lock);
			qdisc_destroy(old_q);
			spin_unlock_bh(&dev->queue_lock);
		}
	}
	return 0;
}
721
/* Append qdisc statistics as a TCA_STATS attribute to skb.  Only the
 * members that precede st->lock are dumped, so the size is computed
 * as the byte offset of the lock pointer within struct tc_stats.
 * Returns 0 on success, -1 if the skb has no room (RTA_PUT's hidden
 * jump to rtattr_failure).
 */
int qdisc_copy_stats(struct sk_buff *skb, struct tc_stats *st)
{
	spin_lock_bh(st->lock);
	RTA_PUT(skb, TCA_STATS, (char*)&st->lock - (char*)st, st);
	spin_unlock_bh(st->lock);
	return 0;

rtattr_failure:
	spin_unlock_bh(st->lock);
	return -1;
}
733
734
/* Build one RTM_*QDISC netlink message describing q into skb.
 * Returns skb->len on success or -1 on overflow, in which case the
 * partially written message is trimmed off again.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, unsigned flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;	/* rollback point on failure */

	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
	nlh->nlmsg_flags = flags;
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = atomic_read(&q->refcnt);
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto rtattr_failure;
	/* Refresh the cached queue length before dumping stats. */
	q->stats.qlen = q->q.qlen;
	if (qdisc_copy_stats(skb, &q->stats))
		goto rtattr_failure;
	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	/* NLMSG_PUT/RTA_PUT jump here on overflow; undo everything after b. */
	skb_trim(skb, b - skb->data);
	return -1;
}
764
765static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
766			u32 clid, struct Qdisc *old, struct Qdisc *new)
767{
768	struct sk_buff *skb;
769	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
770
771	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
772	if (!skb)
773		return -ENOBUFS;
774
775	if (old && old->handle) {
776		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
777			goto err_out;
778	}
779	if (new) {
780		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
781			goto err_out;
782	}
783
784	if (skb->len)
785		return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
786
787err_out:
788	kfree_skb(skb);
789	return -EINVAL;
790}
791
/* Dump all qdiscs of all devices (RTM_GETQDISC with NLM_F_DUMP).
 * cb->args[0]/args[1] record the device and qdisc indices already
 * dumped, so a continued dump resumes where the previous skb filled.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	struct Qdisc *q;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];
	read_lock(&dev_base_lock);
	for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
		if (idx < s_idx)
			continue;
		if (idx > s_idx)
			s_q_idx = 0;	/* new device: restart qdisc index */
		read_lock(&qdisc_tree_lock);
		for (q = dev->qdisc_list, q_idx = 0; q;
		     q = q->next, q_idx++) {
			if (q_idx < s_q_idx)
				continue;
			/* skb full: stop and remember how far we got. */
			if (tc_fill_qdisc(skb, q, 0, NETLINK_CB(cb->skb).pid,
					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
				read_unlock(&qdisc_tree_lock);
				goto done;
			}
		}
		read_unlock(&qdisc_tree_lock);
	}

done:
	read_unlock(&dev_base_lock);

	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
829
830
831
832/************************************************
833 *	Traffic classes manipulation.		*
834 ************************************************/
835
836
837
/*
 * RTM_{NEW,DEL,GET}TCLASS handler: resolve the qdisc and class named
 * by tcm_parent/tcm_handle and dispatch to the qdisc's class ops.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct rtattr **tca = arg;
	struct net_device *dev;
	struct Qdisc *q = NULL;
	struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc_sleeping->handle;

		/* Now qid is genuine qdisc handle consistent
		   both with parent and child.

		   TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc_sleeping->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		/* Unknown class: only an explicit create may proceed. */
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	/* Create a new class or change the existing one. */
	new_cl = cl;
	err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);

	return err;
}
949
950
/* Build one RTM_*TCLASS netlink message for class cl of qdisc q into
 * skb.  Returns skb->len on success or -1 on overflow, in which case
 * the partially written message is trimmed off again.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, unsigned flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char	 *b = skb->tail;	/* rollback point on failure */

	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
	nlh->nlmsg_flags = flags;
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0;
	/* Both ids default to the qdisc handle; the class dump callback
	 * below is expected to overwrite them with the real values. */
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
	if (q->ops->cl_ops->dump && q->ops->cl_ops->dump(q, cl, skb, tcm) < 0)
		goto rtattr_failure;
	nlh->nlmsg_len = skb->tail - b;
	return skb->len;

nlmsg_failure:
rtattr_failure:
	skb_trim(skb, b - skb->data);
	return -1;
}
978
979static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
980			  struct Qdisc *q, unsigned long cl, int event)
981{
982	struct sk_buff *skb;
983	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
984
985	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
986	if (!skb)
987		return -ENOBUFS;
988
989	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
990		kfree_skb(skb);
991		return -EINVAL;
992	}
993
994	return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
995}
996
/* Walker state for class dumping: carries the netlink dump context so
 * qdisc_class_dump() can reach the skb being filled. */
struct qdisc_dump_args
{
	struct qdisc_walker w;	/* must be first: cast back in qdisc_class_dump() */
	struct sk_buff *skb;	/* dump skb being filled */
	struct netlink_callback *cb;	/* netlink dump context */
};
1003
1004static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1005{
1006	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1007
1008	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1009			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1010}
1011
/* Dump all classes of the requested device's qdiscs (RTM_GETTCLASS
 * with NLM_F_DUMP).  cb->args[0] is the index of the qdisc being
 * walked; cb->args[1] the class count already emitted within it.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	int t;
	int s_t;
	struct net_device *dev;
	struct Qdisc *q;
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct qdisc_dump_args arg;

	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];

	read_lock(&qdisc_tree_lock);
	for (q=dev->qdisc_list, t=0; q; q = q->next, t++) {
		if (t < s_t) continue;
		if (!q->ops->cl_ops) continue;
		if (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle)
			continue;
		/* Entering a new qdisc: clear the per-qdisc resume state. */
		if (t > s_t)
			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
		arg.w.fn = qdisc_class_dump;
		arg.skb = skb;
		arg.cb = cb;
		arg.w.stop  = 0;
		arg.w.skip = cb->args[1];
		arg.w.count = 0;
		q->ops->cl_ops->walk(q, &arg.w);
		cb->args[1] = arg.w.count;
		if (arg.w.stop)
			break;
	}
	read_unlock(&qdisc_tree_lock);

	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
1054
/* Scheduler clock conversion factors, exported via /proc/net/psched;
 * set at boot by pktsched_init()/psched_calibrate_clock(). */
int psched_us_per_tick = 1;
int psched_tick_per_us = 1;
1057
1058#ifdef CONFIG_PROC_FS
1059static int psched_read_proc(char *buffer, char **start, off_t offset,
1060			     int length, int *eof, void *data)
1061{
1062	int len;
1063
1064	len = sprintf(buffer, "%08x %08x %08x %08x\n",
1065		      psched_tick_per_us, psched_us_per_tick,
1066		      1000000, HZ);
1067
1068	len -= offset;
1069
1070	if (len > length)
1071		len = length;
1072	if(len < 0)
1073		len = 0;
1074
1075	*start = buffer + offset;
1076	*eof = 1;
1077
1078	return len;
1079}
1080#endif
1081
1082#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY
/* Convert a seconds delta to microseconds, saturating at bound and
 * avoiding 32-bit overflow of the multiplication.
 */
int psched_tod_diff(int delta_sec, int bound)
{
	int delta;

	/* Saturate early when bound is small or the multiply would overflow. */
	if (bound <= 1000000 || delta_sec > (0x7FFFFFFF/1000000)-1)
		return bound;

	delta = delta_sec * 1000000;
	return delta > bound ? bound : delta;
}
1094#endif
1095
/* Accumulated base for the jiffies-derived scheduler clock. */
psched_time_t psched_time_base;

#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
psched_tdiff_t psched_clock_per_hz;	/* clock ticks per jiffie, scaled down */
int psched_clock_scale;			/* log2 of psched_us_per_tick */
#endif

#ifdef PSCHED_WATCHER
PSCHED_WATCHER psched_time_mark;	/* jiffies value at last tick */

static void psched_tick(unsigned long);

static struct timer_list psched_timer =
	{ function: psched_tick };

/* Periodic timer keeping the scheduler clock source fresh: with the
 * CPU clock source it just samples the counter every second; otherwise
 * it folds elapsed jiffies into psched_time_base hourly. */
static void psched_tick(unsigned long dummy)
{
#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
	psched_time_t dummy_stamp;
	PSCHED_GET_TIME(dummy_stamp);
	/* It is OK up to 4GHz cpu */
	psched_timer.expires = jiffies + 1*HZ;
#else
	unsigned long now = jiffies;
	psched_time_base += ((u64)(now-psched_time_mark))<<PSCHED_JSCALE;
	psched_time_mark = now;
	psched_timer.expires = now + 60*60*HZ;
#endif
	add_timer(&psched_timer);
}
#endif
1127
1128#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
1129int __init psched_calibrate_clock(void)
1130{
1131	psched_time_t stamp, stamp1;
1132	struct timeval tv, tv1;
1133	psched_tdiff_t delay;
1134	long rdelay;
1135	unsigned long stop;
1136
1137#ifdef PSCHED_WATCHER
1138	psched_tick(0);
1139#endif
1140	stop = jiffies + HZ/10;
1141	PSCHED_GET_TIME(stamp);
1142	do_gettimeofday(&tv);
1143	while (time_before(jiffies, stop)) {
1144		barrier();
1145		cpu_relax();
1146	}
1147	PSCHED_GET_TIME(stamp1);
1148	do_gettimeofday(&tv1);
1149
1150	delay = PSCHED_TDIFF(stamp1, stamp);
1151	rdelay = tv1.tv_usec - tv.tv_usec;
1152	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
1153	if (rdelay > delay)
1154		return -1;
1155	delay /= rdelay;
1156	psched_tick_per_us = delay;
1157	while ((delay>>=1) != 0)
1158		psched_clock_scale++;
1159	psched_us_per_tick = 1<<psched_clock_scale;
1160	psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
1161	return 0;
1162}
1163#endif
1164
/* Boot-time initialization of the packet scheduler: calibrate the
 * scheduler clock, hook the tc message handlers into the rtnetlink
 * PF_UNSPEC table, register the built-in queueing disciplines selected
 * by the kernel config, and create the /proc/net/psched entry.
 */
int __init pktsched_init(void)
{
	struct rtnetlink_link *link_p;

#if PSCHED_CLOCK_SOURCE == PSCHED_CPU
	if (psched_calibrate_clock() < 0)
		return -1;
#elif PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES
	psched_tick_per_us = HZ<<PSCHED_JSCALE;
	psched_us_per_tick = 1000000;
#ifdef PSCHED_WATCHER
	psched_tick(0);
#endif
#endif

	link_p = rtnetlink_links[PF_UNSPEC];

	/* Setup rtnetlink links. It is made here to avoid
	   exporting large number of public symbols.
	 */

	if (link_p) {
		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
	}

/* Declare and register one built-in qdisc's ops structure. */
#define INIT_QDISC(name) { \
          extern struct Qdisc_ops name##_qdisc_ops; \
          register_qdisc(& name##_qdisc_ops);       \
	}

	/* The fifo disciplines are always built in. */
	INIT_QDISC(pfifo);
	INIT_QDISC(bfifo);

#ifdef CONFIG_NET_SCH_CBQ
	INIT_QDISC(cbq);
#endif
#ifdef CONFIG_NET_SCH_HTB
	INIT_QDISC(htb);
#endif
#ifdef CONFIG_NET_SCH_CSZ
	INIT_QDISC(csz);
#endif
#ifdef CONFIG_NET_SCH_HPFQ
	INIT_QDISC(hpfq);
#endif
#ifdef CONFIG_NET_SCH_HFSC
	INIT_QDISC(hfsc);
#endif
#ifdef CONFIG_NET_SCH_RED
	INIT_QDISC(red);
#endif
#ifdef CONFIG_NET_SCH_GRED
       INIT_QDISC(gred);
#endif
#ifdef CONFIG_NET_SCH_INGRESS
       INIT_QDISC(ingress);
#endif
#ifdef CONFIG_NET_SCH_DSMARK
       INIT_QDISC(dsmark);
#endif
#ifdef CONFIG_NET_SCH_SFQ
	INIT_QDISC(sfq);
#endif
#ifdef CONFIG_NET_SCH_TBF
	INIT_QDISC(tbf);
#endif
#ifdef CONFIG_NET_SCH_TEQL
	teql_init();
#endif
#ifdef CONFIG_NET_SCH_PRIO
	INIT_QDISC(prio);
#endif
#ifdef CONFIG_NET_SCH_ATM
	INIT_QDISC(atm);
#endif
#ifdef CONFIG_NET_CLS
	tc_filter_init();
#endif

#ifdef CONFIG_PROC_FS
	create_proc_read_entry("net/psched", 0, 0, psched_read_proc, NULL);
#endif

	return 0;
}
1257