1/* 2 * net/sched/sch_api.c Packet scheduler API. 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of the GNU General Public License 6 * as published by the Free Software Foundation; either version 7 * 2 of the License, or (at your option) any later version. 8 * 9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 10 * 11 * Fixes: 12 * 13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired. 14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support 15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support 16 */ 17 18#include <linux/config.h> 19#include <linux/types.h> 20#include <linux/kernel.h> 21#include <linux/sched.h> 22#include <linux/string.h> 23#include <linux/mm.h> 24#include <linux/socket.h> 25#include <linux/sockios.h> 26#include <linux/in.h> 27#include <linux/errno.h> 28#include <linux/interrupt.h> 29#include <linux/netdevice.h> 30#include <linux/skbuff.h> 31#include <linux/rtnetlink.h> 32#include <linux/init.h> 33#include <linux/proc_fs.h> 34#include <linux/kmod.h> 35 36#include <net/sock.h> 37#include <net/pkt_sched.h> 38 39#include <asm/processor.h> 40#include <asm/uaccess.h> 41#include <asm/system.h> 42#include <asm/bitops.h> 43 44static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, 45 struct Qdisc *old, struct Qdisc *new); 46static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, 47 struct Qdisc *q, unsigned long cl, int event); 48 49/* 50 51 Short review. 52 ------------- 53 54 This file consists of two interrelated parts: 55 56 1. queueing disciplines manager frontend. 57 2. traffic classes manager frontend. 58 59 Generally, queueing discipline ("qdisc") is a black box, 60 which is able to enqueue packets and to dequeue them (when 61 device is ready to send something) in order and at times 62 determined by algorithm hidden in it. 

   Qdiscs fall into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (look at cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.

   The goal of the routines in this file is to translate the
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to make some sanity
   checks and to do the part of the work which is common to all qdiscs,
   and to provide rtnetlink notifications.

   All real intelligent work is done inside qdisc modules.



   Every discipline has two major routines: enqueue and dequeue.

   ---dequeue

   dequeue usually returns a skb to send. It is allowed to return NULL,
   but that does not mean that the queue is empty; it just means that
   the discipline does not want to send anything this time.
   The queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   a real packet queue; nevertheless q->q.qlen must be valid.

   ---enqueue

   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
   NET_XMIT_DROP 	- this packet was dropped
     Expected action: do not back off, but wait until the queue clears.
   NET_XMIT_CN	 	- this packet was probably enqueued, but another one was dropped.
     Expected action: back off or ignore
   NET_XMIT_POLICED	- dropped by police.
     Expected action: back off or report an error to real-time apps.

   Auxiliary routines:

   ---requeue

   requeues a packet that was already dequeued. It is used for
   non-standard or just buggy devices, which can defer output even
   if dev->tbusy=0.

   ---reset

   returns the qdisc to its initial state: purge all buffers, clear all
   timers, counters (except for statistics) etc.

   ---init

   initializes a newly created qdisc.
120 121 ---destroy 122 123 destroys resources allocated by init and during lifetime of qdisc. 124 125 ---change 126 127 changes qdisc parameters. 128 */ 129 130/* Protects list of registered TC modules. It is pure SMP lock. */ 131static rwlock_t qdisc_mod_lock = RW_LOCK_UNLOCKED; 132 133 134/************************************************ 135 * Queueing disciplines manipulation. * 136 ************************************************/ 137 138 139/* The list of all installed queueing disciplines. */ 140 141static struct Qdisc_ops *qdisc_base = NULL; 142 143/* Register/uregister queueing discipline */ 144 145int register_qdisc(struct Qdisc_ops *qops) 146{ 147 struct Qdisc_ops *q, **qp; 148 149 write_lock(&qdisc_mod_lock); 150 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) { 151 if (strcmp(qops->id, q->id) == 0) { 152 write_unlock(&qdisc_mod_lock); 153 return -EEXIST; 154 } 155 } 156 157 if (qops->enqueue == NULL) 158 qops->enqueue = noop_qdisc_ops.enqueue; 159 if (qops->requeue == NULL) 160 qops->requeue = noop_qdisc_ops.requeue; 161 if (qops->dequeue == NULL) 162 qops->dequeue = noop_qdisc_ops.dequeue; 163 164 qops->next = NULL; 165 *qp = qops; 166 write_unlock(&qdisc_mod_lock); 167 return 0; 168} 169 170int unregister_qdisc(struct Qdisc_ops *qops) 171{ 172 struct Qdisc_ops *q, **qp; 173 int err = -ENOENT; 174 175 write_lock(&qdisc_mod_lock); 176 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) 177 if (q == qops) 178 break; 179 if (q) { 180 *qp = q->next; 181 q->next = NULL; 182 err = 0; 183 } 184 write_unlock(&qdisc_mod_lock); 185 return err; 186} 187 188/* We know handle. Find qdisc among all qdisc's attached to device 189 (root qdisc, all its children, children of children etc.) 
190 */ 191 192struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) 193{ 194 struct Qdisc *q; 195 196 for (q = dev->qdisc_list; q; q = q->next) { 197 if (q->handle == handle) 198 return q; 199 } 200 return NULL; 201} 202 203struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) 204{ 205 unsigned long cl; 206 struct Qdisc *leaf; 207 struct Qdisc_class_ops *cops = p->ops->cl_ops; 208 209 if (cops == NULL) 210 return NULL; 211 cl = cops->get(p, classid); 212 213 if (cl == 0) 214 return NULL; 215 leaf = cops->leaf(p, cl); 216 cops->put(p, cl); 217 return leaf; 218} 219 220/* Find queueing discipline by name */ 221 222struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind) 223{ 224 struct Qdisc_ops *q = NULL; 225 226 if (kind) { 227 read_lock(&qdisc_mod_lock); 228 for (q = qdisc_base; q; q = q->next) { 229 if (rtattr_strcmp(kind, q->id) == 0) 230 break; 231 } 232 read_unlock(&qdisc_mod_lock); 233 } 234 return q; 235} 236 237static struct qdisc_rate_table *qdisc_rtab_list; 238 239struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab) 240{ 241 struct qdisc_rate_table *rtab; 242 243 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { 244 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) { 245 rtab->refcnt++; 246 return rtab; 247 } 248 } 249 250 if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024) 251 return NULL; 252 253 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); 254 if (rtab) { 255 rtab->rate = *r; 256 rtab->refcnt = 1; 257 memcpy(rtab->data, RTA_DATA(tab), 1024); 258 rtab->next = qdisc_rtab_list; 259 qdisc_rtab_list = rtab; 260 } 261 return rtab; 262} 263 264void qdisc_put_rtab(struct qdisc_rate_table *tab) 265{ 266 struct qdisc_rate_table *rtab, **rtabp; 267 268 if (!tab || --tab->refcnt) 269 return; 270 271 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) { 272 if (rtab == tab) { 273 *rtabp = rtab->next; 274 kfree(rtab); 275 return; 276 } 277 } 278} 279 280 281/* 
Allocate an unique handle from space managed by kernel */ 282 283u32 qdisc_alloc_handle(struct net_device *dev) 284{ 285 int i = 0x10000; 286 static u32 autohandle = TC_H_MAKE(0x80000000U, 0); 287 288 do { 289 autohandle += TC_H_MAKE(0x10000U, 0); 290 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) 291 autohandle = TC_H_MAKE(0x80000000U, 0); 292 } while (qdisc_lookup(dev, autohandle) && --i > 0); 293 294 return i>0 ? autohandle : 0; 295} 296 297/* Attach toplevel qdisc to device dev */ 298 299static struct Qdisc * 300dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc) 301{ 302 struct Qdisc *oqdisc; 303 304 if (dev->flags & IFF_UP) 305 dev_deactivate(dev); 306 307 write_lock(&qdisc_tree_lock); 308 spin_lock_bh(&dev->queue_lock); 309 if (qdisc && qdisc->flags&TCQ_F_INGRES) { 310 oqdisc = dev->qdisc_ingress; 311 /* Prune old scheduler */ 312 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) { 313 /* delete */ 314 qdisc_reset(oqdisc); 315 dev->qdisc_ingress = NULL; 316 } else { /* new */ 317 dev->qdisc_ingress = qdisc; 318 } 319 320 } else { 321 322 oqdisc = dev->qdisc_sleeping; 323 324 /* Prune old scheduler */ 325 if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) 326 qdisc_reset(oqdisc); 327 328 /* ... and graft new one */ 329 if (qdisc == NULL) 330 qdisc = &noop_qdisc; 331 dev->qdisc_sleeping = qdisc; 332 dev->qdisc = &noop_qdisc; 333 } 334 335 spin_unlock_bh(&dev->queue_lock); 336 write_unlock(&qdisc_tree_lock); 337 338 if (dev->flags & IFF_UP) 339 dev_activate(dev); 340 341 return oqdisc; 342} 343 344 345/* Graft qdisc "new" to class "classid" of qdisc "parent" or 346 to device "dev". 347 348 Old qdisc is not destroyed but returned in *old. 
349 */ 350 351int qdisc_graft(struct net_device *dev, struct Qdisc *parent, u32 classid, 352 struct Qdisc *new, struct Qdisc **old) 353{ 354 int err = 0; 355 struct Qdisc *q = *old; 356 357 358 if (parent == NULL) { 359 if (q && q->flags&TCQ_F_INGRES) { 360 *old = dev_graft_qdisc(dev, q); 361 } else { 362 *old = dev_graft_qdisc(dev, new); 363 } 364 } else { 365 struct Qdisc_class_ops *cops = parent->ops->cl_ops; 366 367 err = -EINVAL; 368 369 if (cops) { 370 unsigned long cl = cops->get(parent, classid); 371 if (cl) { 372 err = cops->graft(parent, cl, new, old); 373 cops->put(parent, cl); 374 } 375 } 376 } 377 return err; 378} 379 380/* 381 Allocate and initialize new qdisc. 382 383 Parameters are passed via opt. 384 */ 385 386static struct Qdisc * 387qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) 388{ 389 int err; 390 struct rtattr *kind = tca[TCA_KIND-1]; 391 struct Qdisc *sch = NULL; 392 struct Qdisc_ops *ops; 393 int size; 394 395 ops = qdisc_lookup_ops(kind); 396#ifdef CONFIG_KMOD 397 if (ops==NULL && tca[TCA_KIND-1] != NULL) { 398 char module_name[4 + IFNAMSIZ + 1]; 399 400 if (RTA_PAYLOAD(kind) <= IFNAMSIZ) { 401 sprintf(module_name, "sch_%s", (char*)RTA_DATA(kind)); 402 request_module (module_name); 403 ops = qdisc_lookup_ops(kind); 404 } 405 } 406#endif 407 408 err = -EINVAL; 409 if (ops == NULL) 410 goto err_out; 411 412 size = sizeof(*sch) + ops->priv_size; 413 414 sch = kmalloc(size, GFP_KERNEL); 415 err = -ENOBUFS; 416 if (!sch) 417 goto err_out; 418 419 /* Grrr... 
Resolve race condition with module unload */ 420 421 err = -EINVAL; 422 if (ops != qdisc_lookup_ops(kind)) 423 goto err_out; 424 425 memset(sch, 0, size); 426 427 skb_queue_head_init(&sch->q); 428 429 if (handle == TC_H_INGRESS) 430 sch->flags |= TCQ_F_INGRES; 431 432 sch->ops = ops; 433 sch->enqueue = ops->enqueue; 434 sch->dequeue = ops->dequeue; 435 sch->dev = dev; 436 atomic_set(&sch->refcnt, 1); 437 sch->stats.lock = &dev->queue_lock; 438 if (handle == 0) { 439 handle = qdisc_alloc_handle(dev); 440 err = -ENOMEM; 441 if (handle == 0) 442 goto err_out; 443 } 444 445 if (handle == TC_H_INGRESS) 446 sch->handle =TC_H_MAKE(TC_H_INGRESS, 0); 447 else 448 sch->handle = handle; 449 450 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { 451 write_lock(&qdisc_tree_lock); 452 sch->next = dev->qdisc_list; 453 dev->qdisc_list = sch; 454 write_unlock(&qdisc_tree_lock); 455#ifdef CONFIG_NET_ESTIMATOR 456 if (tca[TCA_RATE-1]) 457 qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]); 458#endif 459 return sch; 460 } 461 462err_out: 463 *errp = err; 464 if (sch) 465 kfree(sch); 466 return NULL; 467} 468 469static int qdisc_change(struct Qdisc *sch, struct rtattr **tca) 470{ 471 if (tca[TCA_OPTIONS-1]) { 472 int err; 473 474 if (sch->ops->change == NULL) 475 return -EINVAL; 476 err = sch->ops->change(sch, tca[TCA_OPTIONS-1]); 477 if (err) 478 return err; 479 } 480#ifdef CONFIG_NET_ESTIMATOR 481 if (tca[TCA_RATE-1]) { 482 qdisc_kill_estimator(&sch->stats); 483 qdisc_new_estimator(&sch->stats, tca[TCA_RATE-1]); 484 } 485#endif 486 return 0; 487} 488 489struct check_loop_arg 490{ 491 struct qdisc_walker w; 492 struct Qdisc *p; 493 int depth; 494}; 495 496static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w); 497 498static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) 499{ 500 struct check_loop_arg arg; 501 502 if (q->ops->cl_ops == NULL) 503 return 0; 504 505 arg.w.stop = arg.w.skip = arg.w.count = 0; 506 arg.w.fn = 
check_loop_fn; 507 arg.depth = depth; 508 arg.p = p; 509 q->ops->cl_ops->walk(q, &arg.w); 510 return arg.w.stop ? -ELOOP : 0; 511} 512 513static int 514check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) 515{ 516 struct Qdisc *leaf; 517 struct Qdisc_class_ops *cops = q->ops->cl_ops; 518 struct check_loop_arg *arg = (struct check_loop_arg *)w; 519 520 leaf = cops->leaf(q, cl); 521 if (leaf) { 522 if (leaf == arg->p || arg->depth > 7) 523 return -ELOOP; 524 return check_loop(leaf, arg->p, arg->depth + 1); 525 } 526 return 0; 527} 528 529/* 530 * Delete/get qdisc. 531 */ 532 533static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) 534{ 535 struct tcmsg *tcm = NLMSG_DATA(n); 536 struct rtattr **tca = arg; 537 struct net_device *dev; 538 u32 clid = tcm->tcm_parent; 539 struct Qdisc *q = NULL; 540 struct Qdisc *p = NULL; 541 int err; 542 543 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) 544 return -ENODEV; 545 546 if (clid) { 547 if (clid != TC_H_ROOT) { 548 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { 549 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) 550 return -ENOENT; 551 q = qdisc_leaf(p, clid); 552 } else { /* ingress */ 553 q = dev->qdisc_ingress; 554 } 555 } else { 556 q = dev->qdisc_sleeping; 557 } 558 if (!q) 559 return -ENOENT; 560 561 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) 562 return -EINVAL; 563 } else { 564 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) 565 return -ENOENT; 566 } 567 568 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) 569 return -EINVAL; 570 571 if (n->nlmsg_type == RTM_DELQDISC) { 572 if (!clid) 573 return -EINVAL; 574 if (q->handle == 0) 575 return -ENOENT; 576 if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0) 577 return err; 578 if (q) { 579 qdisc_notify(skb, n, clid, q, NULL); 580 spin_lock_bh(&dev->queue_lock); 581 qdisc_destroy(q); 582 spin_unlock_bh(&dev->queue_lock); 583 } 584 } else { 585 qdisc_notify(skb, n, clid, NULL, q); 
586 } 587 return 0; 588} 589 590/* 591 Create/change qdisc. 592 */ 593 594static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) 595{ 596 struct tcmsg *tcm = NLMSG_DATA(n); 597 struct rtattr **tca = arg; 598 struct net_device *dev; 599 u32 clid = tcm->tcm_parent; 600 struct Qdisc *q = NULL; 601 struct Qdisc *p = NULL; 602 int err; 603 604 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) 605 return -ENODEV; 606 607 if (clid) { 608 if (clid != TC_H_ROOT) { 609 if (clid != TC_H_INGRESS) { 610 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) 611 return -ENOENT; 612 q = qdisc_leaf(p, clid); 613 } else { /*ingress */ 614 q = dev->qdisc_ingress; 615 } 616 } else { 617 q = dev->qdisc_sleeping; 618 } 619 620 /* It may be default qdisc, ignore it */ 621 if (q && q->handle == 0) 622 q = NULL; 623 624 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { 625 if (tcm->tcm_handle) { 626 if (q && !(n->nlmsg_flags&NLM_F_REPLACE)) 627 return -EEXIST; 628 if (TC_H_MIN(tcm->tcm_handle)) 629 return -EINVAL; 630 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) 631 goto create_n_graft; 632 if (n->nlmsg_flags&NLM_F_EXCL) 633 return -EEXIST; 634 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) 635 return -EINVAL; 636 if (q == p || 637 (p && check_loop(q, p, 0))) 638 return -ELOOP; 639 atomic_inc(&q->refcnt); 640 goto graft; 641 } else { 642 if (q == NULL) 643 goto create_n_graft; 644 645 /* This magic test requires explanation. 646 * 647 * We know, that some child q is already 648 * attached to this parent and have choice: 649 * either to change it or to create/graft new one. 650 * 651 * 1. We are allowed to create/graft only 652 * if CREATE and REPLACE flags are set. 653 * 654 * 2. If EXCL is set, requestor wanted to say, 655 * that qdisc tcm_handle is not expected 656 * to exist, so that we choose create/graft too. 657 * 658 * 3. The last case is when no flags are set. 
659 * Alas, it is sort of hole in API, we 660 * cannot decide what to do unambiguously. 661 * For now we select create/graft, if 662 * user gave KIND, which does not match existing. 663 */ 664 if ((n->nlmsg_flags&NLM_F_CREATE) && 665 (n->nlmsg_flags&NLM_F_REPLACE) && 666 ((n->nlmsg_flags&NLM_F_EXCL) || 667 (tca[TCA_KIND-1] && 668 rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)))) 669 goto create_n_graft; 670 } 671 } 672 } else { 673 if (!tcm->tcm_handle) 674 return -EINVAL; 675 q = qdisc_lookup(dev, tcm->tcm_handle); 676 } 677 678 /* Change qdisc parameters */ 679 if (q == NULL) 680 return -ENOENT; 681 if (n->nlmsg_flags&NLM_F_EXCL) 682 return -EEXIST; 683 if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) 684 return -EINVAL; 685 err = qdisc_change(q, tca); 686 if (err == 0) 687 qdisc_notify(skb, n, clid, NULL, q); 688 return err; 689 690create_n_graft: 691 if (!(n->nlmsg_flags&NLM_F_CREATE)) 692 return -ENOENT; 693 if (clid == TC_H_INGRESS) 694 q = qdisc_create(dev, tcm->tcm_parent, tca, &err); 695 else 696 q = qdisc_create(dev, tcm->tcm_handle, tca, &err); 697 if (q == NULL) 698 return err; 699 700graft: 701 if (1) { 702 struct Qdisc *old_q = NULL; 703 err = qdisc_graft(dev, p, clid, q, &old_q); 704 if (err) { 705 if (q) { 706 spin_lock_bh(&dev->queue_lock); 707 qdisc_destroy(q); 708 spin_unlock_bh(&dev->queue_lock); 709 } 710 return err; 711 } 712 qdisc_notify(skb, n, clid, old_q, q); 713 if (old_q) { 714 spin_lock_bh(&dev->queue_lock); 715 qdisc_destroy(old_q); 716 spin_unlock_bh(&dev->queue_lock); 717 } 718 } 719 return 0; 720} 721 722int qdisc_copy_stats(struct sk_buff *skb, struct tc_stats *st) 723{ 724 spin_lock_bh(st->lock); 725 RTA_PUT(skb, TCA_STATS, (char*)&st->lock - (char*)st, st); 726 spin_unlock_bh(st->lock); 727 return 0; 728 729rtattr_failure: 730 spin_unlock_bh(st->lock); 731 return -1; 732} 733 734 735static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, 736 u32 pid, u32 seq, unsigned flags, int event) 737{ 738 
struct tcmsg *tcm; 739 struct nlmsghdr *nlh; 740 unsigned char *b = skb->tail; 741 742 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); 743 nlh->nlmsg_flags = flags; 744 tcm = NLMSG_DATA(nlh); 745 tcm->tcm_family = AF_UNSPEC; 746 tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0; 747 tcm->tcm_parent = clid; 748 tcm->tcm_handle = q->handle; 749 tcm->tcm_info = atomic_read(&q->refcnt); 750 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); 751 if (q->ops->dump && q->ops->dump(q, skb) < 0) 752 goto rtattr_failure; 753 q->stats.qlen = q->q.qlen; 754 if (qdisc_copy_stats(skb, &q->stats)) 755 goto rtattr_failure; 756 nlh->nlmsg_len = skb->tail - b; 757 return skb->len; 758 759nlmsg_failure: 760rtattr_failure: 761 skb_trim(skb, b - skb->data); 762 return -1; 763} 764 765static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, 766 u32 clid, struct Qdisc *old, struct Qdisc *new) 767{ 768 struct sk_buff *skb; 769 u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; 770 771 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 772 if (!skb) 773 return -ENOBUFS; 774 775 if (old && old->handle) { 776 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) 777 goto err_out; 778 } 779 if (new) { 780 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? 
NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) 781 goto err_out; 782 } 783 784 if (skb->len) 785 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); 786 787err_out: 788 kfree_skb(skb); 789 return -EINVAL; 790} 791 792static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) 793{ 794 int idx, q_idx; 795 int s_idx, s_q_idx; 796 struct net_device *dev; 797 struct Qdisc *q; 798 799 s_idx = cb->args[0]; 800 s_q_idx = q_idx = cb->args[1]; 801 read_lock(&dev_base_lock); 802 for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { 803 if (idx < s_idx) 804 continue; 805 if (idx > s_idx) 806 s_q_idx = 0; 807 read_lock(&qdisc_tree_lock); 808 for (q = dev->qdisc_list, q_idx = 0; q; 809 q = q->next, q_idx++) { 810 if (q_idx < s_q_idx) 811 continue; 812 if (tc_fill_qdisc(skb, q, 0, NETLINK_CB(cb->skb).pid, 813 cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) { 814 read_unlock(&qdisc_tree_lock); 815 goto done; 816 } 817 } 818 read_unlock(&qdisc_tree_lock); 819 } 820 821done: 822 read_unlock(&dev_base_lock); 823 824 cb->args[0] = idx; 825 cb->args[1] = q_idx; 826 827 return skb->len; 828} 829 830 831 832/************************************************ 833 * Traffic classes manipulation. * 834 ************************************************/ 835 836 837 838static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) 839{ 840 struct tcmsg *tcm = NLMSG_DATA(n); 841 struct rtattr **tca = arg; 842 struct net_device *dev; 843 struct Qdisc *q = NULL; 844 struct Qdisc_class_ops *cops; 845 unsigned long cl = 0; 846 unsigned long new_cl; 847 u32 pid = tcm->tcm_parent; 848 u32 clid = tcm->tcm_handle; 849 u32 qid = TC_H_MAJ(clid); 850 int err; 851 852 if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) 853 return -ENODEV; 854 855 /* 856 parent == TC_H_UNSPEC - unspecified parent. 857 parent == TC_H_ROOT - class is root, which has no parent. 858 parent == X:0 - parent is root class. 859 parent == X:Y - parent is a node in hierarchy. 
860 parent == 0:Y - parent is X:Y, where X:0 is qdisc. 861 862 handle == 0:0 - generate handle from kernel pool. 863 handle == 0:Y - class is X:Y, where X:0 is qdisc. 864 handle == X:Y - clear. 865 handle == X:0 - root class. 866 */ 867 868 /* Step 1. Determine qdisc handle X:0 */ 869 870 if (pid != TC_H_ROOT) { 871 u32 qid1 = TC_H_MAJ(pid); 872 873 if (qid && qid1) { 874 /* If both majors are known, they must be identical. */ 875 if (qid != qid1) 876 return -EINVAL; 877 } else if (qid1) { 878 qid = qid1; 879 } else if (qid == 0) 880 qid = dev->qdisc_sleeping->handle; 881 882 /* Now qid is genuine qdisc handle consistent 883 both with parent and child. 884 885 TC_H_MAJ(pid) still may be unspecified, complete it now. 886 */ 887 if (pid) 888 pid = TC_H_MAKE(qid, pid); 889 } else { 890 if (qid == 0) 891 qid = dev->qdisc_sleeping->handle; 892 } 893 894 /* OK. Locate qdisc */ 895 if ((q = qdisc_lookup(dev, qid)) == NULL) 896 return -ENOENT; 897 898 /* An check that it supports classes */ 899 cops = q->ops->cl_ops; 900 if (cops == NULL) 901 return -EINVAL; 902 903 /* Now try to get class */ 904 if (clid == 0) { 905 if (pid == TC_H_ROOT) 906 clid = qid; 907 } else 908 clid = TC_H_MAKE(qid, clid); 909 910 if (clid) 911 cl = cops->get(q, clid); 912 913 if (cl == 0) { 914 err = -ENOENT; 915 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE)) 916 goto out; 917 } else { 918 switch (n->nlmsg_type) { 919 case RTM_NEWTCLASS: 920 err = -EEXIST; 921 if (n->nlmsg_flags&NLM_F_EXCL) 922 goto out; 923 break; 924 case RTM_DELTCLASS: 925 err = cops->delete(q, cl); 926 if (err == 0) 927 tclass_notify(skb, n, q, cl, RTM_DELTCLASS); 928 goto out; 929 case RTM_GETTCLASS: 930 err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS); 931 goto out; 932 default: 933 err = -EINVAL; 934 goto out; 935 } 936 } 937 938 new_cl = cl; 939 err = cops->change(q, clid, pid, tca, &new_cl); 940 if (err == 0) 941 tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS); 942 943out: 944 if (cl) 945 
cops->put(q, cl); 946 947 return err; 948} 949 950 951static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, 952 unsigned long cl, 953 u32 pid, u32 seq, unsigned flags, int event) 954{ 955 struct tcmsg *tcm; 956 struct nlmsghdr *nlh; 957 unsigned char *b = skb->tail; 958 959 nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); 960 nlh->nlmsg_flags = flags; 961 tcm = NLMSG_DATA(nlh); 962 tcm->tcm_family = AF_UNSPEC; 963 tcm->tcm_ifindex = q->dev ? q->dev->ifindex : 0; 964 tcm->tcm_parent = q->handle; 965 tcm->tcm_handle = q->handle; 966 tcm->tcm_info = 0; 967 RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); 968 if (q->ops->cl_ops->dump && q->ops->cl_ops->dump(q, cl, skb, tcm) < 0) 969 goto rtattr_failure; 970 nlh->nlmsg_len = skb->tail - b; 971 return skb->len; 972 973nlmsg_failure: 974rtattr_failure: 975 skb_trim(skb, b - skb->data); 976 return -1; 977} 978 979static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, 980 struct Qdisc *q, unsigned long cl, int event) 981{ 982 struct sk_buff *skb; 983 u32 pid = oskb ? 
NETLINK_CB(oskb).pid : 0; 984 985 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 986 if (!skb) 987 return -ENOBUFS; 988 989 if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) { 990 kfree_skb(skb); 991 return -EINVAL; 992 } 993 994 return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); 995} 996 997struct qdisc_dump_args 998{ 999 struct qdisc_walker w; 1000 struct sk_buff *skb; 1001 struct netlink_callback *cb; 1002}; 1003 1004static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) 1005{ 1006 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; 1007 1008 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid, 1009 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); 1010} 1011 1012static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) 1013{ 1014 int t; 1015 int s_t; 1016 struct net_device *dev; 1017 struct Qdisc *q; 1018 struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); 1019 struct qdisc_dump_args arg; 1020 1021 if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) 1022 return 0; 1023 if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) 1024 return 0; 1025 1026 s_t = cb->args[0]; 1027 1028 read_lock(&qdisc_tree_lock); 1029 for (q=dev->qdisc_list, t=0; q; q = q->next, t++) { 1030 if (t < s_t) continue; 1031 if (!q->ops->cl_ops) continue; 1032 if (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle) 1033 continue; 1034 if (t > s_t) 1035 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); 1036 arg.w.fn = qdisc_class_dump; 1037 arg.skb = skb; 1038 arg.cb = cb; 1039 arg.w.stop = 0; 1040 arg.w.skip = cb->args[1]; 1041 arg.w.count = 0; 1042 q->ops->cl_ops->walk(q, &arg.w); 1043 cb->args[1] = arg.w.count; 1044 if (arg.w.stop) 1045 break; 1046 } 1047 read_unlock(&qdisc_tree_lock); 1048 1049 cb->args[0] = t; 1050 1051 dev_put(dev); 1052 return skb->len; 1053} 1054 1055int psched_us_per_tick = 1; 1056int psched_tick_per_us = 1; 1057 1058#ifdef CONFIG_PROC_FS 
1059static int psched_read_proc(char *buffer, char **start, off_t offset, 1060 int length, int *eof, void *data) 1061{ 1062 int len; 1063 1064 len = sprintf(buffer, "%08x %08x %08x %08x\n", 1065 psched_tick_per_us, psched_us_per_tick, 1066 1000000, HZ); 1067 1068 len -= offset; 1069 1070 if (len > length) 1071 len = length; 1072 if(len < 0) 1073 len = 0; 1074 1075 *start = buffer + offset; 1076 *eof = 1; 1077 1078 return len; 1079} 1080#endif 1081 1082#if PSCHED_CLOCK_SOURCE == PSCHED_GETTIMEOFDAY 1083int psched_tod_diff(int delta_sec, int bound) 1084{ 1085 int delta; 1086 1087 if (bound <= 1000000 || delta_sec > (0x7FFFFFFF/1000000)-1) 1088 return bound; 1089 delta = delta_sec * 1000000; 1090 if (delta > bound) 1091 delta = bound; 1092 return delta; 1093} 1094#endif 1095 1096psched_time_t psched_time_base; 1097 1098#if PSCHED_CLOCK_SOURCE == PSCHED_CPU 1099psched_tdiff_t psched_clock_per_hz; 1100int psched_clock_scale; 1101#endif 1102 1103#ifdef PSCHED_WATCHER 1104PSCHED_WATCHER psched_time_mark; 1105 1106static void psched_tick(unsigned long); 1107 1108static struct timer_list psched_timer = 1109 { function: psched_tick }; 1110 1111static void psched_tick(unsigned long dummy) 1112{ 1113#if PSCHED_CLOCK_SOURCE == PSCHED_CPU 1114 psched_time_t dummy_stamp; 1115 PSCHED_GET_TIME(dummy_stamp); 1116 /* It is OK up to 4GHz cpu */ 1117 psched_timer.expires = jiffies + 1*HZ; 1118#else 1119 unsigned long now = jiffies; 1120 psched_time_base += ((u64)(now-psched_time_mark))<<PSCHED_JSCALE; 1121 psched_time_mark = now; 1122 psched_timer.expires = now + 60*60*HZ; 1123#endif 1124 add_timer(&psched_timer); 1125} 1126#endif 1127 1128#if PSCHED_CLOCK_SOURCE == PSCHED_CPU 1129int __init psched_calibrate_clock(void) 1130{ 1131 psched_time_t stamp, stamp1; 1132 struct timeval tv, tv1; 1133 psched_tdiff_t delay; 1134 long rdelay; 1135 unsigned long stop; 1136 1137#ifdef PSCHED_WATCHER 1138 psched_tick(0); 1139#endif 1140 stop = jiffies + HZ/10; 1141 PSCHED_GET_TIME(stamp); 1142 
do_gettimeofday(&tv); 1143 while (time_before(jiffies, stop)) { 1144 barrier(); 1145 cpu_relax(); 1146 } 1147 PSCHED_GET_TIME(stamp1); 1148 do_gettimeofday(&tv1); 1149 1150 delay = PSCHED_TDIFF(stamp1, stamp); 1151 rdelay = tv1.tv_usec - tv.tv_usec; 1152 rdelay += (tv1.tv_sec - tv.tv_sec)*1000000; 1153 if (rdelay > delay) 1154 return -1; 1155 delay /= rdelay; 1156 psched_tick_per_us = delay; 1157 while ((delay>>=1) != 0) 1158 psched_clock_scale++; 1159 psched_us_per_tick = 1<<psched_clock_scale; 1160 psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale; 1161 return 0; 1162} 1163#endif 1164 1165int __init pktsched_init(void) 1166{ 1167 struct rtnetlink_link *link_p; 1168 1169#if PSCHED_CLOCK_SOURCE == PSCHED_CPU 1170 if (psched_calibrate_clock() < 0) 1171 return -1; 1172#elif PSCHED_CLOCK_SOURCE == PSCHED_JIFFIES 1173 psched_tick_per_us = HZ<<PSCHED_JSCALE; 1174 psched_us_per_tick = 1000000; 1175#ifdef PSCHED_WATCHER 1176 psched_tick(0); 1177#endif 1178#endif 1179 1180 link_p = rtnetlink_links[PF_UNSPEC]; 1181 1182 /* Setup rtnetlink links. It is made here to avoid 1183 exporting large number of public symbols. 
1184 */ 1185 1186 if (link_p) { 1187 link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc; 1188 link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc; 1189 link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc; 1190 link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc; 1191 link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass; 1192 link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass; 1193 link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass; 1194 link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass; 1195 } 1196 1197#define INIT_QDISC(name) { \ 1198 extern struct Qdisc_ops name##_qdisc_ops; \ 1199 register_qdisc(& name##_qdisc_ops); \ 1200 } 1201 1202 INIT_QDISC(pfifo); 1203 INIT_QDISC(bfifo); 1204 1205#ifdef CONFIG_NET_SCH_CBQ 1206 INIT_QDISC(cbq); 1207#endif 1208#ifdef CONFIG_NET_SCH_HTB 1209 INIT_QDISC(htb); 1210#endif 1211#ifdef CONFIG_NET_SCH_CSZ 1212 INIT_QDISC(csz); 1213#endif 1214#ifdef CONFIG_NET_SCH_HPFQ 1215 INIT_QDISC(hpfq); 1216#endif 1217#ifdef CONFIG_NET_SCH_HFSC 1218 INIT_QDISC(hfsc); 1219#endif 1220#ifdef CONFIG_NET_SCH_RED 1221 INIT_QDISC(red); 1222#endif 1223#ifdef CONFIG_NET_SCH_GRED 1224 INIT_QDISC(gred); 1225#endif 1226#ifdef CONFIG_NET_SCH_INGRESS 1227 INIT_QDISC(ingress); 1228#endif 1229#ifdef CONFIG_NET_SCH_DSMARK 1230 INIT_QDISC(dsmark); 1231#endif 1232#ifdef CONFIG_NET_SCH_SFQ 1233 INIT_QDISC(sfq); 1234#endif 1235#ifdef CONFIG_NET_SCH_TBF 1236 INIT_QDISC(tbf); 1237#endif 1238#ifdef CONFIG_NET_SCH_TEQL 1239 teql_init(); 1240#endif 1241#ifdef CONFIG_NET_SCH_PRIO 1242 INIT_QDISC(prio); 1243#endif 1244#ifdef CONFIG_NET_SCH_ATM 1245 INIT_QDISC(atm); 1246#endif 1247#ifdef CONFIG_NET_CLS 1248 tc_filter_init(); 1249#endif 1250 1251#ifdef CONFIG_PROC_FS 1252 create_proc_read_entry("net/psched", 0, 0, psched_read_proc, NULL); 1253#endif 1254 1255 return 0; 1256} 1257