1/* 2 * Copyright (c) 2007-2012 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29/* $OpenBSD: altq_rmclass.c,v 1.13 2007/09/13 20:40:02 chl Exp $ */ 30/* $KAME: altq_rmclass.c,v 1.10 2001/02/09 07:20:40 kjc Exp $ */ 31 32/* 33 * Copyright (c) 1991-1997 Regents of the University of California. 34 * All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 3. All advertising materials mentioning features or use of this software 45 * must display the following acknowledgement: 46 * This product includes software developed by the Network Research 47 * Group at Lawrence Berkeley Laboratory. 48 * 4. Neither the name of the University nor of the Laboratory may be used 49 * to endorse or promote products derived from this software without 50 * specific prior written permission. 51 * 52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 62 * SUCH DAMAGE. 63 * 64 * LBL code modified by speer@eng.sun.com, May 1977. 65 * For questions and/or comments, please send mail to cbq@ee.lbl.gov 66 */ 67 68#include <sys/cdefs.h> 69 70#ident "@(#)rm_class.c 1.48 97/12/05 SMI" 71 72#if PKTSCHED_CBQ 73 74#include <sys/param.h> 75#include <sys/malloc.h> 76#include <sys/mbuf.h> 77#include <sys/socket.h> 78#include <sys/systm.h> 79#include <sys/errno.h> 80#include <sys/time.h> 81#include <sys/kernel.h> 82#include <sys/kernel_types.h> 83#include <sys/syslog.h> 84 85#include <kern/zalloc.h> 86 87#include <net/if.h> 88#include <net/net_osdep.h> 89#include <net/pktsched/pktsched.h> 90#include <net/pktsched/pktsched_rmclass.h> 91#include <net/pktsched/pktsched_rmclass_debug.h> 92#include <net/classq/classq_red.h> 93#include <net/classq/classq_rio.h> 94#include <net/classq/classq_blue.h> 95#include <net/classq/classq_sfb.h> 96 97/* 98 * Local Macros 99 */ 100 101#define reset_cutoff(ifd) { ifd->cutoff_ = RM_MAXDEPTH; } 102 103/* 104 * Local routines. 105 */ 106 107static int rmc_satisfied(struct rm_class *, struct timeval *); 108static void rmc_wrr_set_weights(struct rm_ifdat *); 109static void rmc_depth_compute(struct rm_class *); 110static void rmc_depth_recompute(rm_class_t *); 111 112static struct mbuf *_rmc_wrr_dequeue_next(struct rm_ifdat *, cqdq_op_t); 113static struct mbuf *_rmc_prr_dequeue_next(struct rm_ifdat *, cqdq_op_t); 114 115static int _rmc_addq(rm_class_t *, struct mbuf *, struct pf_mtag *); 116static void _rmc_dropq(rm_class_t *); 117static struct mbuf *_rmc_getq(rm_class_t *); 118static struct mbuf *_rmc_pollq(rm_class_t *); 119 120static int rmc_under_limit(struct rm_class *, struct timeval *); 121static void rmc_tl_satisfied(struct rm_ifdat *, struct timeval *); 122static void rmc_drop_action(struct rm_class *); 123static void rmc_restart(struct rm_class *); 124static void rmc_root_overlimit(rm_class_t *, rm_class_t *); 125 126#define RMC_ZONE_MAX 32 /* maximum elements in zone */ 127#define RMC_ZONE_NAME "pktsched_cbq_cl" /* zone name (CBQ for now) */ 128 129static unsigned int rmc_size; /* size of zone element */ 130static struct zone *rmc_zone; /* zone for rm_class */ 131 132void 133rmclass_init(void) 134{ 135 if (rmc_zone != NULL) 136 return; 137 138 rmc_size = sizeof (struct rm_class); 139 rmc_zone = zinit(rmc_size, RMC_ZONE_MAX * rmc_size, 0, RMC_ZONE_NAME); 140 if (rmc_zone == NULL) { 141 panic("%s: failed allocating %s", __func__, RMC_ZONE_NAME); 142 /* NOTREACHED */ 143 } 144 zone_change(rmc_zone, Z_EXPAND, TRUE); 145 zone_change(rmc_zone, Z_CALLERACCT, TRUE); 146} 147 148#define BORROW_OFFTIME 149/* 150 * BORROW_OFFTIME (experimental): 151 * borrow the offtime of the class borrowing from. 152 * the reason is that when its own offtime is set, the class is unable 153 * to borrow much, especially when cutoff is taking effect. 154 * but when the borrowed class is overloaded (advidle is close to minidle), 155 * use the borrowing class's offtime to avoid overload. 156 */ 157#define ADJUST_CUTOFF 158/* 159 * ADJUST_CUTOFF (experimental): 160 * if no underlimit class is found due to cutoff, increase cutoff and 161 * retry the scheduling loop. 162 * also, don't invoke delay_actions while cutoff is taking effect, 163 * since a sleeping class won't have a chance to be scheduled in the 164 * next loop. 165 * 166 * now heuristics for setting the top-level variable (cutoff_) becomes: 167 * 1. if a packet arrives for a not-overlimit class, set cutoff 168 * to the depth of the class. 169 * 2. if cutoff is i, and a packet arrives for an overlimit class 170 * with an underlimit ancestor at a lower level than i (say j), 171 * then set cutoff to j. 172 * 3. at scheduling a packet, if there is no underlimit class 173 * due to the current cutoff level, increase cutoff by 1 and 174 * then try to schedule again. 175 */ 176 177/* 178 * rm_class_t * 179 * rmc_newclass(...) - Create a new resource management class at priority 180 * 'pri' on the interface given by 'ifd'. 181 * 182 * nsecPerByte is the data rate of the interface in nanoseconds/byte. 183 * E.g., 800 for a 10Mb/s ethernet. If the class gets less 184 * than 100% of the bandwidth, this number should be the 185 * 'effective' rate for the class. Let f be the 186 * bandwidth fraction allocated to this class, and let 187 * nsPerByte be the data rate of the output link in 188 * nanoseconds/byte. Then nsecPerByte is set to 189 * nsPerByte / f. E.g., 1600 (= 800 / .5) 190 * for a class that gets 50% of an ethernet's bandwidth. 191 * 192 * action the routine to call when the class is over limit. 193 * 194 * maxq max allowable queue size for class (in packets). 195 * 196 * parent parent class pointer. 197 * 198 * borrow class to borrow from (should be either 'parent' or null). 199 * 200 * maxidle max value allowed for class 'idle' time estimate (this 201 * parameter determines how large an initial burst of packets 202 * can be before overlimit action is invoked. 203 * 204 * offtime how long 'delay' action will delay when class goes over 205 * limit (this parameter determines the steady-state burst 206 * size when a class is running over its limit). 207 * 208 * Maxidle and offtime have to be computed from the following: If the 209 * average packet size is s, the bandwidth fraction allocated to this 210 * class is f, we want to allow b packet bursts, and the gain of the 211 * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then: 212 * 213 * ptime = s * nsPerByte * (1 - f) / f 214 * maxidle = ptime * (1 - g^b) / g^b 215 * minidle = -ptime * (1 / (f - 1)) 216 * offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1) 217 * 218 * Operationally, it's convenient to specify maxidle & offtime in units 219 * independent of the link bandwidth so the maxidle & offtime passed to 220 * this routine are the above values multiplied by 8*f/(1000*nsPerByte). 221 * (The constant factor is a scale factor needed to make the parameters 222 * integers. This scaling also means that the 'unscaled' values of 223 * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds, 224 * not nanoseconds.) Also note that the 'idle' filter computation keeps 225 * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of 226 * maxidle also must be scaled upward by this value. Thus, the passed 227 * values for maxidle and offtime can be computed as follows: 228 * 229 * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte) 230 * offtime = offtime * 8 / (1000 * nsecPerByte) 231 * 232 * When USE_HRTIME is employed, then maxidle and offtime become: 233 * maxidle = maxilde * (8.0 / nsecPerByte); 234 * offtime = offtime * (8.0 / nsecPerByte); 235 */ 236struct rm_class * 237rmc_newclass(int pri, struct rm_ifdat *ifd, u_int32_t nsecPerByte, 238 void (*action)(rm_class_t *, rm_class_t *), u_int32_t qid, u_int32_t maxq, 239 struct rm_class *parent, struct rm_class *borrow, u_int32_t maxidle, 240 int minidle, u_int32_t offtime, int pktsize, int flags) 241{ 242 struct ifnet *ifp; 243 struct ifclassq *ifq; 244 struct rm_class *cl; 245 struct rm_class *peer; 246 247 if (nsecPerByte == 0) { 248 log(LOG_ERR, "%s: invalid inverse data rate\n", __func__); 249 return (NULL); 250 } 251 252 if (pri >= RM_MAXPRIO) { 253 log(LOG_ERR, "%s: priority %d out of range! (max %d)\n", 254 __func__, pri, RM_MAXPRIO - 1); 255 return (NULL); 256 } 257 258#if !CLASSQ_RED 259 if (flags & RMCF_RED) { 260 log(LOG_ERR, "%s: RED not configured for CBQ!\n", __func__); 261 return (NULL); 262 } 263#endif /* !CLASSQ_RED */ 264 265#if !CLASSQ_RIO 266 if (flags & RMCF_RIO) { 267 log(LOG_ERR, "%s: RIO not configured for CBQ!\n", __func__); 268 return (NULL); 269 } 270#endif /* CLASSQ_RIO */ 271 272#if !CLASSQ_BLUE 273 if (flags & RMCF_BLUE) { 274 log(LOG_ERR, "%s: BLUE not configured for CBQ!\n", __func__); 275 return (NULL); 276 } 277#endif /* CLASSQ_BLUE */ 278 279 /* These are mutually exclusive */ 280 if ((flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) && 281 (flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) != RMCF_RED && 282 (flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) != RMCF_RIO && 283 (flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) != RMCF_BLUE && 284 (flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) != RMCF_SFB) { 285 log(LOG_ERR, "%s: RED|RIO|BLUE|SFB mutually exclusive\n", 286 __func__); 287 return (NULL); 288 } 289 290 cl = zalloc(rmc_zone); 291 if (cl == NULL) 292 return (NULL); 293 294 bzero(cl, rmc_size); 295 CALLOUT_INIT(&cl->callout_); 296 297 /* 298 * Class initialization. 299 */ 300 cl->children_ = NULL; 301 cl->parent_ = parent; 302 cl->borrow_ = borrow; 303 cl->leaf_ = 1; 304 cl->ifdat_ = ifd; 305 cl->pri_ = pri; 306 cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ 307 cl->depth_ = 0; 308 cl->qthresh_ = 0; 309 cl->ns_per_byte_ = nsecPerByte; 310 311 ifq = ifd->ifq_; 312 ifp = ifq->ifcq_ifp; 313 314 if (maxq == 0 || maxq > IFCQ_MAXLEN(ifq)) { 315 maxq = IFCQ_MAXLEN(ifq); 316 if (maxq == 0) 317 maxq = DEFAULT_QLIMIT; /* use default */ 318 } 319 _qinit(&cl->q_, Q_DROPHEAD, maxq); 320 321 cl->flags_ = flags; 322 323 cl->minidle_ = (minidle * (int)nsecPerByte) / 8; 324 if (cl->minidle_ > 0) 325 cl->minidle_ = 0; 326 327 cl->maxidle_ = (maxidle * nsecPerByte) / 8; 328 if (cl->maxidle_ == 0) 329 cl->maxidle_ = 1; 330 331 cl->avgidle_ = cl->maxidle_; 332 cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; 333 if (cl->offtime_ == 0) 334 cl->offtime_ = 1; 335 336 cl->overlimit = action; 337 338 if (flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) { 339 int pkttime; 340 341 cl->qflags_ = 0; 342 if (flags & RMCF_ECN) { 343 if (flags & RMCF_BLUE) 344 cl->qflags_ |= BLUEF_ECN; 345 else if (flags & RMCF_SFB) 346 cl->qflags_ |= SFBF_ECN; 347 else if (flags & RMCF_RED) 348 cl->qflags_ |= REDF_ECN; 349 else if (flags & RMCF_RIO) 350 cl->qflags_ |= RIOF_ECN; 351 } 352 if (flags & RMCF_FLOWCTL) { 353 if (flags & RMCF_SFB) 354 cl->qflags_ |= SFBF_FLOWCTL; 355 } 356 if (flags & RMCF_FLOWVALVE) { 357 if (flags & RMCF_RED) 358 cl->qflags_ |= REDF_FLOWVALVE; 359 } 360 if (flags & RMCF_CLEARDSCP) { 361 if (flags & RMCF_RIO) 362 cl->qflags_ |= RIOF_CLEARDSCP; 363 } 364 pkttime = nsecPerByte * pktsize / 1000; 365 366 /* Test for exclusivity {RED,RIO,BLUE,SFB} was done above */ 367#if CLASSQ_RED 368 if (flags & RMCF_RED) { 369 cl->red_ = red_alloc(ifp, 0, 0, 370 qlimit(&cl->q_) * 10/100, 371 qlimit(&cl->q_) * 30/100, 372 cl->qflags_, pkttime); 373 if (cl->red_ != NULL) 374 qtype(&cl->q_) = Q_RED; 375 } 376#endif /* CLASSQ_RED */ 377#if CLASSQ_RIO 378 if (flags & RMCF_RIO) { 379 cl->rio_ = 380 rio_alloc(ifp, 0, NULL, cl->qflags_, pkttime); 381 if (cl->rio_ != NULL) 382 qtype(&cl->q_) = Q_RIO; 383 } 384#endif /* CLASSQ_RIO */ 385#if CLASSQ_BLUE 386 if (flags & RMCF_BLUE) { 387 cl->blue_ = blue_alloc(ifp, 0, 0, cl->qflags_); 388 if (cl->blue_ != NULL) 389 qtype(&cl->q_) = Q_BLUE; 390 } 391#endif /* CLASSQ_BLUE */ 392 if (flags & RMCF_SFB) { 393 if (!(cl->flags_ & RMCF_LAZY)) 394 cl->sfb_ = sfb_alloc(ifp, qid, 395 qlimit(&cl->q_), cl->qflags_); 396 if (cl->sfb_ != NULL || (cl->flags_ & RMCF_LAZY)) 397 qtype(&cl->q_) = Q_SFB; 398 } 399 } 400 401 /* 402 * put the class into the class tree 403 */ 404 if ((peer = ifd->active_[pri]) != NULL) { 405 /* find the last class at this pri */ 406 cl->peer_ = peer; 407 while (peer->peer_ != ifd->active_[pri]) 408 peer = peer->peer_; 409 peer->peer_ = cl; 410 } else { 411 ifd->active_[pri] = cl; 412 cl->peer_ = cl; 413 } 414 415 if (cl->parent_) { 416 cl->next_ = parent->children_; 417 parent->children_ = cl; 418 parent->leaf_ = 0; 419 } 420 421 /* 422 * Compute the depth of this class and its ancestors in the class 423 * hierarchy. 424 */ 425 rmc_depth_compute(cl); 426 427 /* 428 * If CBQ's WRR is enabled, then initialize the class WRR state. 429 */ 430 if (ifd->wrr_) { 431 ifd->num_[pri]++; 432 ifd->alloc_[pri] += cl->allotment_; 433 rmc_wrr_set_weights(ifd); 434 } 435 return (cl); 436} 437 438int 439rmc_modclass(struct rm_class *cl, u_int32_t nsecPerByte, int maxq, 440 u_int32_t maxidle, int minidle, u_int32_t offtime, int pktsize) 441{ 442#pragma unused(pktsize) 443 struct rm_ifdat *ifd; 444 u_int32_t old_allotment; 445 446 ifd = cl->ifdat_; 447 old_allotment = cl->allotment_; 448 449 cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ 450 cl->qthresh_ = 0; 451 cl->ns_per_byte_ = nsecPerByte; 452 453 qlimit(&cl->q_) = maxq; 454 455 cl->minidle_ = (minidle * nsecPerByte) / 8; 456 if (cl->minidle_ > 0) 457 cl->minidle_ = 0; 458 459 cl->maxidle_ = (maxidle * nsecPerByte) / 8; 460 if (cl->maxidle_ == 0) 461 cl->maxidle_ = 1; 462 463 cl->avgidle_ = cl->maxidle_; 464 cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; 465 if (cl->offtime_ == 0) 466 cl->offtime_ = 1; 467 468 /* 469 * If CBQ's WRR is enabled, then initialize the class WRR state. 470 */ 471 if (ifd->wrr_) { 472 ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment; 473 rmc_wrr_set_weights(ifd); 474 } 475 return (0); 476} 477 478/* 479 * static void 480 * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes 481 * the appropriate run robin weights for the CBQ weighted round robin 482 * algorithm. 483 * 484 * Returns: NONE 485 */ 486 487static void 488rmc_wrr_set_weights(struct rm_ifdat *ifd) 489{ 490 int i; 491 struct rm_class *cl, *clh; 492 493 for (i = 0; i < RM_MAXPRIO; i++) { 494 /* 495 * This is inverted from that of the simulator to 496 * maintain precision. 497 */ 498 if (ifd->num_[i] == 0) { 499 ifd->M_[i] = 0; 500 } else { 501 ifd->M_[i] = 502 ifd->alloc_[i] / (ifd->num_[i] * ifd->maxpkt_); 503 } 504 /* 505 * Compute the weighted allotment for each class. 506 * This takes the expensive div instruction out 507 * of the main loop for the wrr scheduling path. 508 * These only get recomputed when a class comes or 509 * goes. 510 */ 511 if (ifd->active_[i] != NULL) { 512 clh = cl = ifd->active_[i]; 513 do { 514 /* safe-guard for slow link or alloc_ == 0 */ 515 if (ifd->M_[i] == 0) { 516 cl->w_allotment_ = 0; 517 } else { 518 cl->w_allotment_ = 519 cl->allotment_ / ifd->M_[i]; 520 } 521 cl = cl->peer_; 522 } while ((cl != NULL) && (cl != clh)); 523 } 524 } 525} 526 527int 528rmc_get_weight(struct rm_ifdat *ifd, int pri) 529{ 530 if ((pri >= 0) && (pri < RM_MAXPRIO)) 531 return (ifd->M_[pri]); 532 else 533 return (0); 534} 535 536/* 537 * static void 538 * rmc_depth_compute(struct rm_class *cl) - This function computes the 539 * appropriate depth of class 'cl' and its ancestors. 540 * 541 * Returns: NONE 542 */ 543 544static void 545rmc_depth_compute(struct rm_class *cl) 546{ 547 rm_class_t *t = cl, *p; 548 549 /* 550 * Recompute the depth for the branch of the tree. 551 */ 552 while (t != NULL) { 553 p = t->parent_; 554 if (p && (t->depth_ >= p->depth_)) { 555 p->depth_ = t->depth_ + 1; 556 t = p; 557 } else 558 t = NULL; 559 } 560} 561 562/* 563 * static void 564 * rmc_depth_recompute(struct rm_class *cl) - This function re-computes 565 * the depth of the tree after a class has been deleted. 566 * 567 * Returns: NONE 568 */ 569 570static void 571rmc_depth_recompute(rm_class_t *cl) 572{ 573 rm_class_t *p, *t; 574 575 p = cl; 576 while (p != NULL) { 577 if ((t = p->children_) == NULL) { 578 p->depth_ = 0; 579 } else { 580 int cdepth = 0; 581 582 while (t != NULL) { 583 if (t->depth_ > cdepth) 584 cdepth = t->depth_; 585 t = t->next_; 586 } 587 588 if (p->depth_ == cdepth + 1) 589 /* no change to this parent */ 590 return; 591 592 p->depth_ = cdepth + 1; 593 } 594 595 p = p->parent_; 596 } 597} 598 599/* 600 * void 601 * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This 602 * function deletes a class from the link-sharing structure and frees 603 * all resources associated with the class. 604 * 605 * Returns: NONE 606 */ 607 608void 609rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl) 610{ 611 struct rm_class *p, *head, *previous; 612 613 VERIFY(cl->children_ == NULL); 614 615 if (cl->sleeping_) 616 CALLOUT_STOP(&cl->callout_); 617 618 /* 619 * Free packets in the packet queue. 620 * XXX - this may not be a desired behavior. Packets should be 621 * re-queued. 622 */ 623 rmc_dropall(cl); 624 625 /* 626 * If the class has a parent, then remove the class from the 627 * class from the parent's children chain. 628 */ 629 if (cl->parent_ != NULL) { 630 head = cl->parent_->children_; 631 p = previous = head; 632 if (head->next_ == NULL) { 633 VERIFY(head == cl); 634 cl->parent_->children_ = NULL; 635 cl->parent_->leaf_ = 1; 636 } else while (p != NULL) { 637 if (p == cl) { 638 if (cl == head) 639 cl->parent_->children_ = cl->next_; 640 else 641 previous->next_ = cl->next_; 642 cl->next_ = NULL; 643 p = NULL; 644 } else { 645 previous = p; 646 p = p->next_; 647 } 648 } 649 } 650 651 /* 652 * Delete class from class priority peer list. 653 */ 654 if ((p = ifd->active_[cl->pri_]) != NULL) { 655 /* 656 * If there is more than one member of this priority 657 * level, then look for class(cl) in the priority level. 658 */ 659 if (p != p->peer_) { 660 while (p->peer_ != cl) 661 p = p->peer_; 662 p->peer_ = cl->peer_; 663 664 if (ifd->active_[cl->pri_] == cl) 665 ifd->active_[cl->pri_] = cl->peer_; 666 } else { 667 VERIFY(p == cl); 668 ifd->active_[cl->pri_] = NULL; 669 } 670 } 671 672 /* 673 * Recompute the WRR weights. 674 */ 675 if (ifd->wrr_) { 676 ifd->alloc_[cl->pri_] -= cl->allotment_; 677 ifd->num_[cl->pri_]--; 678 rmc_wrr_set_weights(ifd); 679 } 680 681 /* 682 * Re-compute the depth of the tree. 683 */ 684 rmc_depth_recompute(cl->parent_); 685 686 /* 687 * Free the class structure. 688 */ 689 if (cl->qalg_.ptr != NULL) { 690#if CLASSQ_RIO 691 if (q_is_rio(&cl->q_)) 692 rio_destroy(cl->rio_); 693#endif /* CLASSQ_RIO */ 694#if CLASSQ_RED 695 if (q_is_red(&cl->q_)) 696 red_destroy(cl->red_); 697#endif /* CLASSQ_RED */ 698#if CLASSQ_BLUE 699 if (q_is_blue(&cl->q_)) 700 blue_destroy(cl->blue_); 701#endif /* CLASSQ_BLUE */ 702 if (q_is_sfb(&cl->q_) && cl->sfb_ != NULL) 703 sfb_destroy(cl->sfb_); 704 cl->qalg_.ptr = NULL; 705 qtype(&cl->q_) = Q_DROPTAIL; 706 qstate(&cl->q_) = QS_RUNNING; 707 } 708 zfree(rmc_zone, cl); 709} 710 711 712/* 713 * int 714 * rmc_init(...) - Initialize the resource management data structures 715 * associated with the output portion of interface 'ifp'. 'ifd' is 716 * where the structures will be built (for backwards compatibility, the 717 * structures aren't kept in the ifnet struct). 'nsecPerByte' 718 * gives the link speed (inverse of bandwidth) in nanoseconds/byte. 719 * 'restart' is the driver-specific routine that the generic 'delay 720 * until under limit' action will call to restart output. `maxq' 721 * is the queue size of the 'link' & 'default' classes. 'maxqueued' 722 * is the maximum number of packets that the resource management 723 * code will allow to be queued 'downstream' (this is typically 1). 724 * 725 * Returns: 0 on success 726 */ 727 728int 729rmc_init(struct ifclassq *ifq, struct rm_ifdat *ifd, u_int32_t nsecPerByte, 730 void (*restart)(struct ifclassq *), u_int32_t qid, int maxq, int maxqueued, 731 u_int32_t maxidle, int minidle, u_int32_t offtime, int flags) 732{ 733 struct ifnet *ifp = ifq->ifcq_ifp; 734 int i, mtu; 735 736 /* 737 * Initialize the CBQ tracing/debug facility. 738 */ 739 CBQTRACEINIT(); 740 741 if (nsecPerByte == 0) { 742 log(LOG_ERR, "%s: %s: invalid inverse data rate)\n", 743 __func__, if_name(ifp)); 744 return (EINVAL); 745 } 746 747 mtu = ifp->if_mtu; 748 if (mtu < 1) { 749 log(LOG_ERR, "%s: %s: invalid MTU (interface not " 750 "initialized?)\n", __func__, if_name(ifp)); 751 return (EINVAL); 752 } 753 bzero((char *)ifd, sizeof (*ifd)); 754 755 ifd->ifq_ = ifq; 756 ifd->restart = restart; 757 ifd->maxqueued_ = maxqueued; 758 ifd->ns_per_byte_ = nsecPerByte; 759 ifd->maxpkt_ = mtu; 760 ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0; 761 ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0; 762#if 1 763 ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16; 764 if (mtu * nsecPerByte > 10 * 1000000) 765 ifd->maxiftime_ /= 4; 766#endif 767 768 reset_cutoff(ifd); 769 CBQTRACE(rmc_init, 'INIT', ifd->cutoff_); 770 771 /* 772 * Initialize the CBQ's WRR state. 773 */ 774 for (i = 0; i < RM_MAXPRIO; i++) { 775 ifd->alloc_[i] = 0; 776 ifd->M_[i] = 0; 777 ifd->num_[i] = 0; 778 ifd->na_[i] = 0; 779 ifd->active_[i] = NULL; 780 } 781 782 /* 783 * Initialize current packet state. 784 */ 785 ifd->qi_ = 0; 786 ifd->qo_ = 0; 787 for (i = 0; i < RM_MAXQUEUED; i++) { 788 ifd->class_[i] = NULL; 789 ifd->curlen_[i] = 0; 790 ifd->borrowed_[i] = NULL; 791 } 792 793 /* 794 * Create the root class of the link-sharing structure. 795 */ 796 if ((ifd->root_ = rmc_newclass(0, ifd, nsecPerByte, 797 rmc_root_overlimit, qid, maxq, 0, 0, maxidle, minidle, offtime, 798 0, 0)) == NULL) { 799 log(LOG_ERR, "rmc_init: root class not allocated\n"); 800 return (ENOMEM); 801 } 802 ifd->root_->depth_ = 0; 803 804 return (0); 805} 806 807/* 808 * void 809 * rmc_queue_packet(struct rm_class *cl, struct mbuf *m) - Add packet given by 810 * mbuf 'm' to queue for resource class 'cl'. This routine is called 811 * by a driver's if_output routine. This routine must be called with 812 * output packet completion interrupts locked out (to avoid racing with 813 * rmc_dequeue_next). 814 * 815 * Returns: 0 on successful queueing 816 * CLASSQEQ_DROPPED when packet drop occurs 817 */ 818int 819rmc_queue_packet(struct rm_class *cl, struct mbuf *m, struct pf_mtag *t) 820{ 821 struct timeval now; 822 struct rm_ifdat *ifd = cl->ifdat_; 823 int cpri = cl->pri_; 824 int is_empty = qempty(&cl->q_); 825 int ret = 0; 826 827 RM_GETTIME(now); 828 if (ifd->cutoff_ > 0) { 829 if (TV_LT(&cl->undertime_, &now)) { 830 if (ifd->cutoff_ > cl->depth_) 831 ifd->cutoff_ = cl->depth_; 832 CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_); 833 } else { 834 /* 835 * the class is overlimit. if the class has 836 * underlimit ancestors, set cutoff to the lowest 837 * depth among them. 838 */ 839 struct rm_class *borrow = cl->borrow_; 840 841 while (borrow != NULL && 842 borrow->depth_ < ifd->cutoff_) { 843 if (TV_LT(&borrow->undertime_, &now)) { 844 ifd->cutoff_ = borrow->depth_; 845 CBQTRACE(rmc_queue_packet, 'ffob', 846 ifd->cutoff_); 847 break; 848 } 849 borrow = borrow->borrow_; 850 } 851 } 852 } 853 854 ret = _rmc_addq(cl, m, t); 855 if (ret != 0 && 856 (ret == CLASSQEQ_DROPPED || ret == CLASSQEQ_DROPPED_FC || 857 ret == CLASSQEQ_DROPPED_SP)) { 858 /* failed */ 859 return (ret); 860 } 861 VERIFY(ret == 0 || ret == CLASSQEQ_SUCCESS_FC); 862 if (is_empty) { 863 CBQTRACE(rmc_queue_packet, 'type', cl->stats_.handle); 864 ifd->na_[cpri]++; 865 } 866 867 if (qlen(&cl->q_) > qlimit(&cl->q_)) { 868 /* note: qlimit can be set to 0 or 1 */ 869 rmc_drop_action(cl); 870 return (CLASSQEQ_DROPPED); 871 } 872 return (ret); 873} 874 875/* 876 * void 877 * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all 878 * classes to see if there are satified. 879 */ 880 881static void 882rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) 883{ 884 int i; 885 rm_class_t *p, *bp; 886 887 for (i = RM_MAXPRIO - 1; i >= 0; i--) { 888 if ((bp = ifd->active_[i]) != NULL) { 889 p = bp; 890 do { 891 if (!rmc_satisfied(p, now)) { 892 ifd->cutoff_ = p->depth_; 893 return; 894 } 895 p = p->peer_; 896 } while (p != bp); 897 } 898 } 899 900 reset_cutoff(ifd); 901} 902 903/* 904 * rmc_satisfied - Return 1 of the class is satisfied. O, otherwise. 905 */ 906 907static int 908rmc_satisfied(struct rm_class *cl, struct timeval *now) 909{ 910 rm_class_t *p; 911 912 if (cl == NULL) 913 return (1); 914 if (TV_LT(now, &cl->undertime_)) 915 return (1); 916 if (cl->depth_ == 0) { 917 if (!cl->sleeping_ && (qlen(&cl->q_) > cl->qthresh_)) 918 return (0); 919 else 920 return (1); 921 } 922 if (cl->children_ != NULL) { 923 p = cl->children_; 924 while (p != NULL) { 925 if (!rmc_satisfied(p, now)) 926 return (0); 927 p = p->next_; 928 } 929 } 930 931 return (1); 932} 933 934/* 935 * Return 1 if class 'cl' is under limit or can borrow from a parent, 936 * 0 if overlimit. As a side-effect, this routine will invoke the 937 * class overlimit action if the class if overlimit. 938 */ 939 940static int 941rmc_under_limit(struct rm_class *cl, struct timeval *now) 942{ 943 rm_class_t *p = cl; 944 rm_class_t *top; 945 struct rm_ifdat *ifd = cl->ifdat_; 946 947 ifd->borrowed_[ifd->qi_] = NULL; 948 /* 949 * If cl is the root class, then always return that it is 950 * underlimit. Otherwise, check to see if the class is underlimit. 951 */ 952 if (cl->parent_ == NULL) 953 return (1); 954 955 if (cl->sleeping_) { 956 if (TV_LT(now, &cl->undertime_)) 957 return (0); 958 959 CALLOUT_STOP(&cl->callout_); 960 cl->sleeping_ = 0; 961 cl->undertime_.tv_sec = 0; 962 return (1); 963 } 964 965 top = NULL; 966 while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) { 967 if (((cl = cl->borrow_) == NULL) || 968 (cl->depth_ > ifd->cutoff_)) { 969#ifdef ADJUST_CUTOFF 970 if (cl != NULL) 971 /* 972 * cutoff is taking effect, just 973 * return false without calling 974 * the delay action. 975 */ 976 return (0); 977#endif 978#ifdef BORROW_OFFTIME 979 /* 980 * check if the class can borrow offtime too. 981 * borrow offtime from the top of the borrow 982 * chain if the top class is not overloaded. 983 */ 984 if (cl != NULL) { 985 /* 986 * cutoff is taking effect, use this 987 * class as top. 988 */ 989 top = cl; 990 CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_); 991 } 992 if (top != NULL && top->avgidle_ == top->minidle_) 993 top = NULL; 994 p->overtime_ = *now; 995 (p->overlimit)(p, top); 996#else 997 p->overtime_ = *now; 998 (p->overlimit)(p, NULL); 999#endif 1000 return (0); 1001 } 1002 top = cl; 1003 } 1004 1005 if (cl != p) 1006 ifd->borrowed_[ifd->qi_] = cl; 1007 return (1); 1008} 1009 1010/* 1011 * _rmc_wrr_dequeue_next() - This is scheduler for WRR as opposed to 1012 * Packet-by-packet round robin. 1013 * 1014 * The heart of the weighted round-robin scheduler, which decides which 1015 * class next gets to send a packet. Highest priority first, then 1016 * weighted round-robin within priorites. 1017 * 1018 * Each able-to-send class gets to send until its byte allocation is 1019 * exhausted. Thus, the active pointer is only changed after a class has 1020 * exhausted its allocation. 1021 * 1022 * If the scheduler finds no class that is underlimit or able to borrow, 1023 * then the first class found that had a nonzero queue and is allowed to 1024 * borrow gets to send. 1025 */ 1026 1027static struct mbuf * 1028_rmc_wrr_dequeue_next(struct rm_ifdat *ifd, cqdq_op_t op) 1029{ 1030 struct rm_class *cl = NULL, *first = NULL; 1031 u_int32_t deficit; 1032 int cpri; 1033 struct mbuf *m; 1034 struct timeval now; 1035 1036 RM_GETTIME(now); 1037 1038 /* 1039 * if the driver polls the top of the queue and then removes 1040 * the polled packet, we must return the same packet. 1041 */ 1042 if (op == CLASSQDQ_REMOVE && ifd->pollcache_) { 1043 cl = ifd->pollcache_; 1044 cpri = cl->pri_; 1045 if (ifd->efficient_) { 1046 /* check if this class is overlimit */ 1047 if (cl->undertime_.tv_sec != 0 && 1048 rmc_under_limit(cl, &now) == 0) 1049 first = cl; 1050 } 1051 ifd->pollcache_ = NULL; 1052 goto _wrr_out; 1053 } else { 1054 /* mode == CLASSQDQ_POLL || pollcache == NULL */ 1055 ifd->pollcache_ = NULL; 1056 ifd->borrowed_[ifd->qi_] = NULL; 1057 } 1058#ifdef ADJUST_CUTOFF 1059_again: 1060#endif 1061 for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) { 1062 if (ifd->na_[cpri] == 0) 1063 continue; 1064 deficit = 0; 1065 /* 1066 * Loop through twice for a priority level, if some class 1067 * was unable to send a packet the first round because 1068 * of the weighted round-robin mechanism. 1069 * During the second loop at this level, deficit==2. 1070 * (This second loop is not needed if for every class, 1071 * "M[cl->pri_])" times "cl->allotment" is greater than 1072 * the byte size for the largest packet in the class.) 1073 */ 1074_wrr_loop: 1075 cl = ifd->active_[cpri]; 1076 VERIFY(cl != NULL); 1077 do { 1078 if ((deficit < 2) && (cl->bytes_alloc_ <= 0)) 1079 cl->bytes_alloc_ += cl->w_allotment_; 1080 if (!qempty(&cl->q_)) { 1081 if ((cl->undertime_.tv_sec == 0) || 1082 rmc_under_limit(cl, &now)) { 1083 if (cl->bytes_alloc_ > 0 || deficit > 1) 1084 goto _wrr_out; 1085 1086 /* underlimit but no alloc */ 1087 deficit = 1; 1088#if 1 1089 ifd->borrowed_[ifd->qi_] = NULL; 1090#endif 1091 } else if (first == NULL && cl->borrow_ != NULL) 1092 first = cl; /* borrowing candidate */ 1093 } 1094 1095 cl->bytes_alloc_ = 0; 1096 cl = cl->peer_; 1097 } while (cl != ifd->active_[cpri]); 1098 1099 if (deficit == 1) { 1100 /* first loop found an underlimit class with deficit */ 1101 /* Loop on same priority level, with new deficit. */ 1102 deficit = 2; 1103 goto _wrr_loop; 1104 } 1105 } 1106 1107#ifdef ADJUST_CUTOFF 1108 /* 1109 * no underlimit class found. if cutoff is taking effect, 1110 * increase cutoff and try again. 1111 */ 1112 if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { 1113 ifd->cutoff_++; 1114 CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_); 1115 goto _again; 1116 } 1117#endif /* ADJUST_CUTOFF */ 1118 /* 1119 * If LINK_EFFICIENCY is turned on, then the first overlimit 1120 * class we encounter will send a packet if all the classes 1121 * of the link-sharing structure are overlimit. 1122 */ 1123 reset_cutoff(ifd); 1124 CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_); 1125 1126 if (!ifd->efficient_ || first == NULL) 1127 return (NULL); 1128 1129 cl = first; 1130 cpri = cl->pri_; 1131#if 0 /* too time-consuming for nothing */ 1132 if (cl->sleeping_) 1133 CALLOUT_STOP(&cl->callout_); 1134 cl->sleeping_ = 0; 1135 cl->undertime_.tv_sec = 0; 1136#endif 1137 ifd->borrowed_[ifd->qi_] = cl->borrow_; 1138 ifd->cutoff_ = cl->borrow_->depth_; 1139 1140 /* 1141 * Deque the packet and do the book keeping... 1142 */ 1143_wrr_out: 1144 if (op == CLASSQDQ_REMOVE) { 1145 m = _rmc_getq(cl); 1146 if (m == NULL) 1147 return (NULL); 1148 1149 if (qempty(&cl->q_)) 1150 ifd->na_[cpri]--; 1151 1152 /* 1153 * Update class statistics and link data. 1154 */ 1155 if (cl->bytes_alloc_ > 0) 1156 cl->bytes_alloc_ -= m_pktlen(m); 1157 1158 if ((cl->bytes_alloc_ <= 0) || first == cl) 1159 ifd->active_[cl->pri_] = cl->peer_; 1160 else 1161 ifd->active_[cl->pri_] = cl; 1162 1163 ifd->class_[ifd->qi_] = cl; 1164 ifd->curlen_[ifd->qi_] = m_pktlen(m); 1165 ifd->now_[ifd->qi_] = now; 1166 ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; 1167 ifd->queued_++; 1168 } else { 1169 /* mode == ALTDQ_PPOLL */ 1170 m = _rmc_pollq(cl); 1171 ifd->pollcache_ = cl; 1172 } 1173 return (m); 1174} 1175 1176/* 1177 * Dequeue & return next packet from the highest priority class that 1178 * has a packet to send & has enough allocation to send it. This 1179 * routine is called by a driver whenever it needs a new packet to 1180 * output. 1181 */ 1182static struct mbuf * 1183_rmc_prr_dequeue_next(struct rm_ifdat *ifd, cqdq_op_t op) 1184{ 1185 struct mbuf *m; 1186 int cpri; 1187 struct rm_class *cl, *first = NULL; 1188 struct timeval now; 1189 1190 RM_GETTIME(now); 1191 1192 /* 1193 * if the driver polls the top of the queue and then removes 1194 * the polled packet, we must return the same packet. 1195 */ 1196 if (op == CLASSQDQ_REMOVE && ifd->pollcache_) { 1197 cl = ifd->pollcache_; 1198 cpri = cl->pri_; 1199 ifd->pollcache_ = NULL; 1200 goto _prr_out; 1201 } else { 1202 /* mode == CLASSQDQ_POLL || pollcache == NULL */ 1203 ifd->pollcache_ = NULL; 1204 ifd->borrowed_[ifd->qi_] = NULL; 1205 } 1206#ifdef ADJUST_CUTOFF 1207_again: 1208#endif 1209 for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) { 1210 if (ifd->na_[cpri] == 0) 1211 continue; 1212 cl = ifd->active_[cpri]; 1213 VERIFY(cl != NULL); 1214 do { 1215 if (!qempty(&cl->q_)) { 1216 if ((cl->undertime_.tv_sec == 0) || 1217 rmc_under_limit(cl, &now)) 1218 goto _prr_out; 1219 if (first == NULL && cl->borrow_ != NULL) 1220 first = cl; 1221 } 1222 cl = cl->peer_; 1223 } while (cl != ifd->active_[cpri]); 1224 } 1225 1226#ifdef ADJUST_CUTOFF 1227 /* 1228 * no underlimit class found. if cutoff is taking effect, increase 1229 * cutoff and try again. 1230 */ 1231 if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { 1232 ifd->cutoff_++; 1233 goto _again; 1234 } 1235#endif /* ADJUST_CUTOFF */ 1236 /* 1237 * If LINK_EFFICIENCY is turned on, then the first overlimit 1238 * class we encounter will send a packet if all the classes 1239 * of the link-sharing structure are overlimit. 1240 */ 1241 reset_cutoff(ifd); 1242 if (!ifd->efficient_ || first == NULL) 1243 return (NULL); 1244 1245 cl = first; 1246 cpri = cl->pri_; 1247#if 0 /* too time-consuming for nothing */ 1248 if (cl->sleeping_) 1249 CALLOUT_STOP(&cl->callout_); 1250 cl->sleeping_ = 0; 1251 cl->undertime_.tv_sec = 0; 1252#endif 1253 ifd->borrowed_[ifd->qi_] = cl->borrow_; 1254 ifd->cutoff_ = cl->borrow_->depth_; 1255 1256 /* 1257 * Deque the packet and do the book keeping... 1258 */ 1259_prr_out: 1260 if (op == CLASSQDQ_REMOVE) { 1261 m = _rmc_getq(cl); 1262 if (m == NULL) 1263 return (NULL); 1264 1265 if (qempty(&cl->q_)) 1266 ifd->na_[cpri]--; 1267 1268 ifd->active_[cpri] = cl->peer_; 1269 1270 ifd->class_[ifd->qi_] = cl; 1271 ifd->curlen_[ifd->qi_] = m_pktlen(m); 1272 ifd->now_[ifd->qi_] = now; 1273 ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; 1274 ifd->queued_++; 1275 } else { 1276 /* mode == CLASSQDQ_POLL */ 1277 m = _rmc_pollq(cl); 1278 ifd->pollcache_ = cl; 1279 } 1280 return (m); 1281} 1282 1283/* 1284 * struct mbuf * 1285 * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function 1286 * is invoked by the packet driver to get the next packet to be 1287 * dequeued and output on the link. If WRR is enabled, then the 1288 * WRR dequeue next routine will determine the next packet to sent. 1289 * Otherwise, packet-by-packet round robin is invoked. 1290 * 1291 * Returns: NULL, if a packet is not available or if all 1292 * classes are overlimit. 1293 * 1294 * Otherwise, Pointer to the next packet. 1295 */ 1296 1297struct mbuf * 1298rmc_dequeue_next(struct rm_ifdat *ifd, cqdq_op_t mode) 1299{ 1300 if (ifd->queued_ >= ifd->maxqueued_) 1301 return (NULL); 1302 else if (ifd->wrr_) 1303 return (_rmc_wrr_dequeue_next(ifd, mode)); 1304 else 1305 return (_rmc_prr_dequeue_next(ifd, mode)); 1306} 1307 1308/* 1309 * Update the utilization estimate for the packet that just completed. 1310 * The packet's class & the parent(s) of that class all get their 1311 * estimators updated. This routine is called by the driver's output- 1312 * packet-completion interrupt service routine. 1313 */ 1314 1315/* 1316 * a macro to approximate "divide by 1000" that gives 0.000999, 1317 * if a value has enough effective digits. 1318 * (on pentium, mul takes 9 cycles but div takes 46!) 1319 */ 1320#define NSEC_TO_USEC(t) (((t) >> 10) + ((t) >> 16) + ((t) >> 17)) 1321void 1322rmc_update_class_util(struct rm_ifdat *ifd) 1323{ 1324 int idle, avgidle, pktlen; 1325 int pkt_time, tidle; 1326 rm_class_t *cl, *borrowed; 1327 rm_class_t *borrows; 1328 struct timeval *nowp; 1329 1330 /* 1331 * Get the most recent completed class. 1332 */ 1333 if ((cl = ifd->class_[ifd->qo_]) == NULL) 1334 return; 1335 1336 pktlen = ifd->curlen_[ifd->qo_]; 1337 borrowed = ifd->borrowed_[ifd->qo_]; 1338 borrows = borrowed; 1339 1340 PKTCNTR_ADD(&cl->stats_.xmit_cnt, 1, pktlen); 1341 1342 /* 1343 * Run estimator on class and its ancestors. 1344 */ 1345 /* 1346 * rm_update_class_util is designed to be called when the 1347 * transfer is completed from a xmit complete interrupt, 1348 * but most drivers don't implement an upcall for that. 1349 * so, just use estimated completion time. 1350 * as a result, ifd->qi_ and ifd->qo_ are always synced. 1351 */ 1352 nowp = &ifd->now_[ifd->qo_]; 1353 /* get pkt_time (for link) in usec */ 1354#if 1 /* use approximation */ 1355 pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_; 1356 pkt_time = NSEC_TO_USEC(pkt_time); 1357#else 1358 pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000; 1359#endif 1360#if 1 /* ALTQ4PPP */ 1361 if (TV_LT(nowp, &ifd->ifnow_)) { 1362 int iftime; 1363 1364 /* 1365 * make sure the estimated completion time does not go 1366 * too far. it can happen when the link layer supports 1367 * data compression or the interface speed is set to 1368 * a much lower value. 1369 */ 1370 TV_DELTA(&ifd->ifnow_, nowp, iftime); 1371 if (iftime+pkt_time < ifd->maxiftime_) { 1372 TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); 1373 } else { 1374 TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_); 1375 } 1376 } else { 1377 TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); 1378 } 1379#else 1380 if (TV_LT(nowp, &ifd->ifnow_)) { 1381 TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); 1382 } else { 1383 TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); 1384 } 1385#endif 1386 1387 while (cl != NULL) { 1388 TV_DELTA(&ifd->ifnow_, &cl->last_, idle); 1389 if (idle >= 2000000) 1390 /* 1391 * this class is idle enough, reset avgidle. 1392 * (TV_DELTA returns 2000000 us when delta is large.) 1393 */ 1394 cl->avgidle_ = cl->maxidle_; 1395 1396 /* get pkt_time (for class) in usec */ 1397#if 1 /* use approximation */ 1398 pkt_time = pktlen * cl->ns_per_byte_; 1399 pkt_time = NSEC_TO_USEC(pkt_time); 1400#else 1401 pkt_time = pktlen * cl->ns_per_byte_ / 1000; 1402#endif 1403 idle -= pkt_time; 1404 1405 avgidle = cl->avgidle_; 1406 avgidle += idle - (avgidle >> RM_FILTER_GAIN); 1407 cl->avgidle_ = avgidle; 1408 1409 /* Are we overlimit ? */ 1410 if (avgidle <= 0) { 1411 CBQTRACE(rmc_update_class_util, 'milo', 1412 cl->stats_.handle); 1413 /* 1414 * need some lower bound for avgidle, otherwise 1415 * a borrowing class gets unbounded penalty. 1416 */ 1417 if (avgidle < cl->minidle_) 1418 avgidle = cl->avgidle_ = cl->minidle_; 1419 1420 /* set next idle to make avgidle 0 */ 1421 tidle = pkt_time + 1422 (((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN); 1423 TV_ADD_DELTA(nowp, tidle, &cl->undertime_); 1424 ++cl->stats_.over; 1425 } else { 1426 cl->avgidle_ = 1427 (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle; 1428 cl->undertime_.tv_sec = 0; 1429 if (cl->sleeping_) { 1430 CALLOUT_STOP(&cl->callout_); 1431 cl->sleeping_ = 0; 1432 } 1433 } 1434 1435 if (borrows != NULL) { 1436 if (borrows != cl) 1437 ++cl->stats_.borrows; 1438 else 1439 borrows = NULL; 1440 } 1441 cl->last_ = ifd->ifnow_; 1442 cl->last_pkttime_ = pkt_time; 1443 1444#if 1 1445 if (cl->parent_ == NULL) { 1446 /* take stats of root class */ 1447 PKTCNTR_ADD(&cl->stats_.xmit_cnt, 1, pktlen); 1448 } 1449#endif 1450 1451 cl = cl->parent_; 1452 } 1453 1454 /* 1455 * Check to see if cutoff needs to set to a new level. 1456 */ 1457 cl = ifd->class_[ifd->qo_]; 1458 if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) { 1459 if ((qlen(&cl->q_) <= 0) || 1460 TV_LT(nowp, &borrowed->undertime_)) { 1461 rmc_tl_satisfied(ifd, nowp); 1462 CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_); 1463 } else { 1464 ifd->cutoff_ = borrowed->depth_; 1465 CBQTRACE(rmc_update_class_util, 'ffob', 1466 borrowed->depth_); 1467 } 1468 } 1469 1470 /* 1471 * Release class slot 1472 */ 1473 ifd->borrowed_[ifd->qo_] = NULL; 1474 ifd->class_[ifd->qo_] = NULL; 1475 ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_; 1476 ifd->queued_--; 1477} 1478 1479/* 1480 * void 1481 * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific) 1482 * over-limit action routines. These get invoked by rmc_under_limit() 1483 * if a class with packets to send if over its bandwidth limit & can't 1484 * borrow from a parent class. 1485 * 1486 * Returns: NONE 1487 */ 1488 1489static void 1490rmc_drop_action(struct rm_class *cl) 1491{ 1492 struct rm_ifdat *ifd = cl->ifdat_; 1493 1494 VERIFY(qlen(&cl->q_) > 0); 1495 IFCQ_CONVERT_LOCK(ifd->ifq_); 1496 _rmc_dropq(cl); 1497 if (qempty(&cl->q_)) 1498 ifd->na_[cl->pri_]--; 1499} 1500 1501void 1502rmc_drop(struct rm_class *cl, u_int32_t flow, u_int32_t *packets, 1503 u_int32_t *bytes) 1504{ 1505 struct rm_ifdat *ifd = cl->ifdat_; 1506 struct ifclassq *ifq = ifd->ifq_; 1507 u_int32_t pkt = 0, len = 0, qlen; 1508 1509 if ((qlen = qlen(&cl->q_)) != 0) { 1510 IFCQ_CONVERT_LOCK(ifq); 1511#if CLASSQ_RIO 1512 if (q_is_rio(&cl->q_)) 1513 rio_purgeq(cl->rio_, &cl->q_, flow, &pkt, &len); 1514 else 1515#endif /* CLASSQ_RIO */ 1516#if CLASSQ_RED 1517 if (q_is_red(&cl->q_)) 1518 red_purgeq(cl->red_, &cl->q_, flow, &pkt, &len); 1519 else 1520#endif /* CLASSQ_RED */ 1521#if CLASSQ_BLUE 1522 if (q_is_blue(&cl->q_)) 1523 blue_purgeq(cl->blue_, &cl->q_, flow, &pkt, &len); 1524 else 1525#endif /* CLASSQ_BLUE */ 1526 if (q_is_sfb(&cl->q_) && cl->sfb_ != NULL) 1527 sfb_purgeq(cl->sfb_, &cl->q_, flow, &pkt, &len); 1528 else 1529 _flushq_flow(&cl->q_, flow, &pkt, &len); 1530 1531 if (pkt > 0) { 1532 VERIFY(qlen(&cl->q_) == (qlen - pkt)); 1533 1534 PKTCNTR_ADD(&cl->stats_.drop_cnt, pkt, len); 1535 IFCQ_DROP_ADD(ifq, pkt, len); 1536 1537 VERIFY(((signed)IFCQ_LEN(ifq) - pkt) >= 0); 1538 IFCQ_LEN(ifq) -= pkt; 1539 1540 if (qempty(&cl->q_)) 1541 ifd->na_[cl->pri_]--; 1542 } 1543 } 1544 if (packets != NULL) 1545 *packets = pkt; 1546 if (bytes != NULL) 1547 *bytes = len; 1548} 1549 1550void 1551rmc_dropall(struct rm_class *cl) 1552{ 1553 rmc_drop(cl, 0, NULL, NULL); 1554} 1555 1556/* 1557 * void 1558 * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ 1559 * delay action routine. It is invoked via rmc_under_limit when the 1560 * packet is discoverd to be overlimit. 1561 * 1562 * If the delay action is result of borrow class being overlimit, then 1563 * delay for the offtime of the borrowing class that is overlimit. 1564 * 1565 * Returns: NONE 1566 */ 1567 1568void 1569rmc_delay_action(struct rm_class *cl, struct rm_class *borrow) 1570{ 1571 int ndelay, t, extradelay; 1572 1573 cl->stats_.overactions++; 1574 TV_DELTA(&cl->undertime_, &cl->overtime_, ndelay); 1575#ifndef BORROW_OFFTIME 1576 ndelay += cl->offtime_; 1577#endif 1578 1579 if (!cl->sleeping_) { 1580 CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle); 1581#ifdef BORROW_OFFTIME 1582 if (borrow != NULL) 1583 extradelay = borrow->offtime_; 1584 else 1585#endif 1586 extradelay = cl->offtime_; 1587 1588 /* 1589 * XXX recalculate suspend time: 1590 * current undertime is (tidle + pkt_time) calculated 1591 * from the last transmission. 1592 * tidle: time required to bring avgidle back to 0 1593 * pkt_time: target waiting time for this class 1594 * we need to replace pkt_time by offtime 1595 */ 1596 extradelay -= cl->last_pkttime_; 1597 if (extradelay > 0) { 1598 TV_ADD_DELTA(&cl->undertime_, extradelay, 1599 &cl->undertime_); 1600 ndelay += extradelay; 1601 } 1602 1603 cl->sleeping_ = 1; 1604 cl->stats_.delays++; 1605 1606 /* 1607 * Since packets are phased randomly with respect to the 1608 * clock, 1 tick (the next clock tick) can be an arbitrarily 1609 * short time so we have to wait for at least two ticks. 1610 * NOTE: If there's no other traffic, we need the timer as 1611 * a 'backstop' to restart this class. 1612 */ 1613 if (ndelay > tick * 2) { 1614 /* 1615 * FreeBSD rounds up the tick; 1616 * other BSDs round down the tick. 1617 */ 1618 t = hzto(&cl->undertime_) + 1; 1619 } else { 1620 t = 2; 1621 } 1622 CALLOUT_RESET(&cl->callout_, t, 1623 (timeout_t *)rmc_restart, (caddr_t)cl); 1624 } 1625} 1626 1627/* 1628 * void 1629 * rmc_restart() - is just a helper routine for rmc_delay_action -- it is 1630 * called by the system timer code & is responsible checking if the 1631 * class is still sleeping (it might have been restarted as a side 1632 * effect of the queue scan on a packet arrival) and, if so, restarting 1633 * output for the class. Inspecting the class state & restarting output 1634 * require locking the class structure. In general the driver is 1635 * responsible for locking but this is the only routine that is not 1636 * called directly or indirectly from the interface driver so it has 1637 * know about system locking conventions. 1638 * 1639 * Returns: NONE 1640 */ 1641 1642static void 1643rmc_restart(struct rm_class *cl) 1644{ 1645 struct rm_ifdat *ifd = cl->ifdat_; 1646 1647 if (cl->sleeping_) { 1648 cl->sleeping_ = 0; 1649 cl->undertime_.tv_sec = 0; 1650 1651 if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) { 1652 CBQTRACE(rmc_restart, 'trts', cl->stats_.handle); 1653 (ifd->restart)(ifd->ifq_); 1654 } 1655 } 1656} 1657 1658/* 1659 * void 1660 * rmc_root_overlimit(struct rm_class *cl) - This the generic overlimit 1661 * handling routine for the root class of the link sharing structure. 1662 * 1663 * Returns: NONE 1664 */ 1665static void 1666rmc_root_overlimit(struct rm_class *cl, 1667 struct rm_class *borrow) 1668{ 1669#pragma unused(cl, borrow) 1670 panic("rmc_root_overlimit"); 1671} 1672 1673/* 1674 * Packet Queue handling routines. Eventually, this is to localize the 1675 * effects on the code whether queues are red queues or droptail 1676 * queues. 1677 */ 1678 1679static int 1680_rmc_addq(rm_class_t *cl, struct mbuf *m, struct pf_mtag *t) 1681{ 1682#if CLASSQ_RIO 1683 if (q_is_rio(&cl->q_)) 1684 return (rio_addq(cl->rio_, &cl->q_, m, t)); 1685 else 1686#endif /* CLASSQ_RIO */ 1687#if CLASSQ_RED 1688 if (q_is_red(&cl->q_)) 1689 return (red_addq(cl->red_, &cl->q_, m, t)); 1690 else 1691#endif /* CLASSQ_RED */ 1692#if CLASSQ_BLUE 1693 if (q_is_blue(&cl->q_)) 1694 return (blue_addq(cl->blue_, &cl->q_, m, t)); 1695 else 1696#endif /* CLASSQ_BLUE */ 1697 if (q_is_sfb(&cl->q_)) { 1698 if (cl->sfb_ == NULL) { 1699 struct ifclassq *ifq = cl->ifdat_->ifq_; 1700 struct ifnet *ifp = ifq->ifcq_ifp; 1701 1702 VERIFY(cl->flags_ & RMCF_LAZY); 1703 IFCQ_CONVERT_LOCK(ifq); 1704 1705 cl->sfb_ = sfb_alloc(ifp, cl->stats_.handle, 1706 qlimit(&cl->q_), cl->qflags_); 1707 if (cl->sfb_ == NULL) { 1708 /* fall back to droptail */ 1709 qtype(&cl->q_) = Q_DROPTAIL; 1710 cl->flags_ &= ~RMCF_SFB; 1711 cl->qflags_ &= ~(SFBF_ECN | SFBF_FLOWCTL); 1712 1713 log(LOG_ERR, "%s: CBQ SFB lazy allocation " 1714 "failed for qid=%d pri=%d, falling back " 1715 "to DROPTAIL\n", if_name(ifp), 1716 cl->stats_.handle, cl->pri_); 1717 } 1718 } 1719 if (cl->sfb_ != NULL) 1720 return (sfb_addq(cl->sfb_, &cl->q_, m, t)); 1721 } 1722#if PF_ECN 1723 else if (cl->flags_ & RMCF_CLEARDSCP) 1724 write_dsfield(m, t, 0); 1725#endif /* PF_ECN */ 1726 1727 /* test for qlen > qlimit is done by caller */ 1728 _addq(&cl->q_, m); 1729 return (0); 1730} 1731 1732/* note: _rmc_dropq is not called for red */ 1733static void 1734_rmc_dropq(rm_class_t *cl) 1735{ 1736 struct mbuf *m; 1737 1738 if ((m = _rmc_getq(cl)) != NULL) 1739 m_freem(m); 1740} 1741 1742static struct mbuf * 1743_rmc_getq(rm_class_t *cl) 1744{ 1745#if CLASSQ_RIO 1746 if (q_is_rio(&cl->q_)) 1747 return (rio_getq(cl->rio_, &cl->q_)); 1748 else 1749#endif /* CLASSQ_RIO */ 1750#if CLASSQ_RED 1751 if (q_is_red(&cl->q_)) 1752 return (red_getq(cl->red_, &cl->q_)); 1753 else 1754#endif /* CLASSQ_RED */ 1755#if CLASSQ_BLUE 1756 if (q_is_blue(&cl->q_)) 1757 return (blue_getq(cl->blue_, &cl->q_)); 1758 else 1759#endif /* CLASSQ_BLUE */ 1760 if (q_is_sfb(&cl->q_) && cl->sfb_ != NULL) 1761 return (sfb_getq(cl->sfb_, &cl->q_)); 1762 1763 return (_getq(&cl->q_)); 1764} 1765 1766static struct mbuf * 1767_rmc_pollq(rm_class_t *cl) 1768{ 1769 return (qhead(&cl->q_)); 1770} 1771 1772void 1773rmc_updateq(rm_class_t *cl, cqev_t ev) 1774{ 1775#if CLASSQ_RIO 1776 if (q_is_rio(&cl->q_)) 1777 return (rio_updateq(cl->rio_, ev)); 1778#endif /* CLASSQ_RIO */ 1779#if CLASSQ_RED 1780 if (q_is_red(&cl->q_)) 1781 return (red_updateq(cl->red_, ev)); 1782#endif /* CLASSQ_RED */ 1783#if CLASSQ_BLUE 1784 if (q_is_blue(&cl->q_)) 1785 return (blue_updateq(cl->blue_, ev)); 1786#endif /* CLASSQ_BLUE */ 1787 if (q_is_sfb(&cl->q_) && cl->sfb_ != NULL) 1788 return (sfb_updateq(cl->sfb_, ev)); 1789} 1790 1791#ifdef CBQ_TRACE 1792 1793struct cbqtrace cbqtrace_buffer[NCBQTRACE+1]; 1794struct cbqtrace *cbqtrace_ptr = NULL; 1795int cbqtrace_count; 1796 1797/* 1798 * DDB hook to trace cbq events: 1799 * the last 1024 events are held in a circular buffer. 1800 * use "call cbqtrace_dump(N)" to display 20 events from Nth event. 1801 */ 1802void cbqtrace_dump(int); 1803static char *rmc_funcname(void *); 1804 1805static struct rmc_funcs { 1806 void *func; 1807 char *name; 1808} rmc_funcs[] = 1809{ 1810 rmc_init, "rmc_init", 1811 rmc_queue_packet, "rmc_queue_packet", 1812 rmc_under_limit, "rmc_under_limit", 1813 rmc_update_class_util, "rmc_update_class_util", 1814 rmc_delay_action, "rmc_delay_action", 1815 rmc_restart, "rmc_restart", 1816 _rmc_wrr_dequeue_next, "_rmc_wrr_dequeue_next", 1817 NULL, NULL 1818}; 1819 1820static char * 1821rmc_funcname(void *func) 1822{ 1823 struct rmc_funcs *fp; 1824 1825 for (fp = rmc_funcs; fp->func != NULL; fp++) 1826 if (fp->func == func) 1827 return (fp->name); 1828 return ("unknown"); 1829} 1830 1831void 1832cbqtrace_dump(int counter) 1833{ 1834 int i, *p; 1835 char *cp; 1836 1837 counter = counter % NCBQTRACE; 1838 p = (int *)&cbqtrace_buffer[counter]; 1839 1840 for (i = 0; i < 20; i++) { 1841 log(LOG_DEBUG, "[0x%x] ", *p++); 1842 log(LOG_DEBUG, "%s: ", rmc_funcname((void *)*p++)); 1843 cp = (char *)p++; 1844 log(LOG_DEBUG, "%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]); 1845 log(LOG_DEBUG, "%d\n", *p++); 1846 1847 if (p >= (int *)&cbqtrace_buffer[NCBQTRACE]) 1848 p = (int *)cbqtrace_buffer; 1849 } 1850} 1851#endif /* CBQ_TRACE */ 1852#endif /* PKTSCHED_CBQ */ 1853