1/* 2 * Copyright (c) 2007-2012 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29/* $OpenBSD: altq_red.c,v 1.14 2007/09/13 20:40:02 chl Exp $ */ 30/* $KAME: altq_red.c,v 1.10 2002/04/03 05:38:51 kjc Exp $ */ 31 32/* 33 * Copyright (C) 1997-2003 34 * Sony Computer Science Laboratories Inc. All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 45 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND 46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 48 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE 49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 55 * SUCH DAMAGE. 56 * 57 */ 58/* 59 * Copyright (c) 1990-1994 Regents of the University of California. 60 * All rights reserved. 61 * 62 * Redistribution and use in source and binary forms, with or without 63 * modification, are permitted provided that the following conditions 64 * are met: 65 * 1. Redistributions of source code must retain the above copyright 66 * notice, this list of conditions and the following disclaimer. 67 * 2. Redistributions in binary form must reproduce the above copyright 68 * notice, this list of conditions and the following disclaimer in the 69 * documentation and/or other materials provided with the distribution. 70 * 3. All advertising materials mentioning features or use of this software 71 * must display the following acknowledgement: 72 * This product includes software developed by the Computer Systems 73 * Engineering Group at Lawrence Berkeley Laboratory. 74 * 4. Neither the name of the University nor of the Laboratory may be used 75 * to endorse or promote products derived from this software without 76 * specific prior written permission. 77 * 78 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 81 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 82 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 88 * SUCH DAMAGE. 89 */ 90 91#include <sys/cdefs.h> 92 93#if CLASSQ_RED 94 95#include <sys/param.h> 96#include <sys/malloc.h> 97#include <sys/mbuf.h> 98#include <sys/socket.h> 99#include <sys/systm.h> 100#include <sys/errno.h> 101#include <sys/kauth.h> 102 103#include <kern/zalloc.h> 104 105#include <net/if.h> 106 107#include <netinet/in.h> 108#include <netinet/in_systm.h> 109#include <netinet/ip.h> 110#if INET6 111#include <netinet/ip6.h> 112#endif 113 114#include <net/classq/classq_red.h> 115 116/* 117 * ALTQ/RED (Random Early Detection) implementation using 32-bit 118 * fixed-point calculation. 119 * 120 * written by kjc using the ns code as a reference. 121 * you can learn more about red and ns from Sally's home page at 122 * http://www-nrg.ee.lbl.gov/floyd/ 123 * 124 * most of the red parameter values are fixed in this implementation 125 * to prevent fixed-point overflow/underflow. 126 * if you change the parameters, watch out for overflow/underflow! 127 * 128 * the parameters used are recommended values by Sally. 129 * the corresponding ns config looks: 130 * q_weight=0.00195 131 * minthresh=5 maxthresh=15 queue-size=60 132 * linterm=30 133 * dropmech=drop-tail 134 * bytes=false (can't be handled by 32-bit fixed-point) 135 * doubleq=false dqthresh=false 136 * wait=true 137 */ 138/* 139 * alternative red parameters for a slow link. 140 * 141 * assume the queue length becomes from zero to L and keeps L, it takes 142 * N packets for q_avg to reach 63% of L. 143 * when q_weight is 0.002, N is about 500 packets. 144 * for a slow link like dial-up, 500 packets takes more than 1 minute! 145 * when q_weight is 0.008, N is about 127 packets. 146 * when q_weight is 0.016, N is about 63 packets. 147 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets 148 * are allowed for 0.016. 149 * see Sally's paper for more details. 150 */ 151/* normal red parameters */ 152#define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ 153 /* q_weight = 0.00195 */ 154 155/* red parameters for a slow link */ 156#define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ 157 /* q_weight = 0.0078125 */ 158 159/* red parameters for a very slow link (e.g., dialup) */ 160#define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ 161 /* q_weight = 0.015625 */ 162 163/* fixed-point uses 12-bit decimal places */ 164#define FP_SHIFT 12 /* fixed-point shift */ 165 166/* red parameters for drop probability */ 167#define INV_P_MAX 10 /* inverse of max drop probability */ 168#define TH_MIN 5 /* min threshold */ 169#define TH_MAX 15 /* max threshold */ 170 171#define RED_LIMIT 60 /* default max queue lenght */ 172 173#define RED_ZONE_MAX 32 /* maximum elements in zone */ 174#define RED_ZONE_NAME "classq_red" /* zone name */ 175 176static unsigned int red_size; /* size of zone element */ 177static struct zone *red_zone; /* zone for red */ 178 179/* 180 * our default policy for forced-drop is drop-tail. 181 * (in altq-1.1.2 or earlier, the default was random-drop. 182 * but it makes more sense to punish the cause of the surge.) 183 * to switch to the random-drop policy, define "RED_RANDOM_DROP". 184 */ 185 186/* default red parameter values */ 187static int default_th_min = TH_MIN; 188static int default_th_max = TH_MAX; 189static int default_inv_pmax = INV_P_MAX; 190 191static struct mbuf *red_getq_flow(struct red *, class_queue_t *, 192 u_int32_t, boolean_t); 193 194void 195red_init(void) 196{ 197 _CASSERT(REDF_ECN4 == CLASSQF_ECN4); 198 _CASSERT(REDF_ECN6 == CLASSQF_ECN6); 199 200 red_size = sizeof (red_t); 201 red_zone = zinit(red_size, RED_ZONE_MAX * red_size, 202 0, RED_ZONE_NAME); 203 if (red_zone == NULL) { 204 panic("%s: failed allocating %s", __func__, RED_ZONE_NAME); 205 /* NOTREACHED */ 206 } 207 zone_change(red_zone, Z_EXPAND, TRUE); 208 zone_change(red_zone, Z_CALLERACCT, TRUE); 209} 210 211/* 212 * red support routines 213 */ 214red_t * 215red_alloc(struct ifnet *ifp, int weight, int inv_pmax, int th_min, 216 int th_max, int flags, int pkttime) 217{ 218 red_t *rp; 219 int w, i; 220 int npkts_per_sec; 221 222 VERIFY(ifp != NULL); 223 224 rp = zalloc(red_zone); 225 if (rp == NULL) 226 return (NULL); 227 228 bzero(rp, red_size); 229 rp->red_avg = 0; 230 rp->red_idle = 1; 231 232 if (weight == 0) 233 rp->red_weight = W_WEIGHT; 234 else 235 rp->red_weight = weight; 236 if (inv_pmax == 0) 237 rp->red_inv_pmax = default_inv_pmax; 238 else 239 rp->red_inv_pmax = inv_pmax; 240 if (th_min == 0) 241 rp->red_thmin = default_th_min; 242 else 243 rp->red_thmin = th_min; 244 if (th_max == 0) 245 rp->red_thmax = default_th_max; 246 else 247 rp->red_thmax = th_max; 248 249 rp->red_flags = (flags & REDF_USERFLAGS); 250 rp->red_ifp = ifp; 251 252 if (pkttime == 0) 253 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ 254 rp->red_pkttime = 800; 255 else 256 rp->red_pkttime = pkttime; 257 258 if (weight == 0) { 259 /* when the link is very slow, adjust red parameters */ 260 npkts_per_sec = 1000000 / rp->red_pkttime; 261 if (npkts_per_sec < 50) { 262 /* up to about 400Kbps */ 263 rp->red_weight = W_WEIGHT_2; 264 } else if (npkts_per_sec < 300) { 265 /* up to about 2.4Mbps */ 266 rp->red_weight = W_WEIGHT_1; 267 } 268 } 269 270 /* calculate wshift. weight must be power of 2 */ 271 w = rp->red_weight; 272 for (i = 0; w > 1; i++) 273 w = w >> 1; 274 rp->red_wshift = i; 275 w = 1 << rp->red_wshift; 276 if (w != rp->red_weight) { 277 printf("invalid weight value %d for red! use %d\n", 278 rp->red_weight, w); 279 rp->red_weight = w; 280 } 281 282 /* 283 * thmin_s and thmax_s are scaled versions of th_min and th_max 284 * to be compared with avg. 285 */ 286 rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT); 287 rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT); 288 289 /* 290 * precompute probability denominator 291 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point 292 */ 293 rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin) * 294 rp->red_inv_pmax) << FP_SHIFT; 295 296 /* allocate weight table */ 297 rp->red_wtab = wtab_alloc(rp->red_weight); 298 if (rp->red_wtab == NULL) { 299 red_destroy(rp); 300 return (NULL); 301 } 302 303 microuptime(&rp->red_last); 304 return (rp); 305} 306 307void 308red_destroy(red_t *rp) 309{ 310 if (rp->red_wtab != NULL) { 311 wtab_destroy(rp->red_wtab); 312 rp->red_wtab = NULL; 313 } 314 zfree(red_zone, rp); 315} 316 317void 318red_getstats(red_t *rp, struct red_stats *sp) 319{ 320 sp->q_avg = rp->red_avg >> rp->red_wshift; 321 sp->drop_forced = rp->red_stats.drop_forced; 322 sp->drop_unforced = rp->red_stats.drop_unforced; 323 sp->marked_packets = rp->red_stats.marked_packets; 324} 325 326int 327red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, struct pf_mtag *tag) 328{ 329 int avg, droptype; 330 int n; 331 332 avg = rp->red_avg; 333 334 /* 335 * if we were idle, we pretend that n packets arrived during 336 * the idle period. 337 */ 338 if (rp->red_idle) { 339 struct timeval now; 340 int t; 341 342 rp->red_idle = 0; 343 microuptime(&now); 344 t = (now.tv_sec - rp->red_last.tv_sec); 345 if (t > 60) { 346 /* 347 * being idle for more than 1 minute, set avg to zero. 348 * this prevents t from overflow. 349 */ 350 avg = 0; 351 } else { 352 t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec); 353 n = t / rp->red_pkttime - 1; 354 355 /* the following line does (avg = (1 - Wq)^n * avg) */ 356 if (n > 0) 357 avg = (avg >> FP_SHIFT) * 358 pow_w(rp->red_wtab, n); 359 } 360 } 361 362 /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */ 363 avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift); 364 rp->red_avg = avg; /* save the new value */ 365 366 /* 367 * red_count keeps a tally of arriving traffic that has not 368 * been dropped. 369 */ 370 rp->red_count++; 371 372 /* see if we drop early */ 373 droptype = DTYPE_NODROP; 374 if (avg >= rp->red_thmin_s && qlen(q) > 1) { 375 if (avg >= rp->red_thmax_s) { 376 /* avg >= th_max: forced drop */ 377 droptype = DTYPE_FORCED; 378 } else if (rp->red_old == 0) { 379 /* first exceeds th_min */ 380 rp->red_count = 1; 381 rp->red_old = 1; 382 } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift, 383 rp->red_probd, rp->red_count)) { 384 /* mark or drop by red */ 385 if ((rp->red_flags & REDF_ECN) && 386 (tag->pftag_flags & PF_TAG_TCP) && /* only TCP */ 387 mark_ecn(m, tag, rp->red_flags)) { 388 /* successfully marked. do not drop. */ 389 rp->red_count = 0; 390 rp->red_stats.marked_packets++; 391 } else { 392 /* unforced drop by red */ 393 droptype = DTYPE_EARLY; 394 } 395 } 396 } else { 397 /* avg < th_min */ 398 rp->red_old = 0; 399 } 400 401 /* 402 * if the queue length hits the hard limit, it's a forced drop. 403 */ 404 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) 405 droptype = DTYPE_FORCED; 406 407#ifdef RED_RANDOM_DROP 408 /* if successful or forced drop, enqueue this packet. */ 409 if (droptype != DTYPE_EARLY) 410 _addq(q, m); 411#else 412 /* if successful, enqueue this packet. */ 413 if (droptype == DTYPE_NODROP) 414 _addq(q, m); 415#endif 416 if (droptype != DTYPE_NODROP) { 417 if (droptype == DTYPE_EARLY) { 418 /* drop the incoming packet */ 419 rp->red_stats.drop_unforced++; 420 } else { 421 /* forced drop, select a victim packet in the queue. */ 422#ifdef RED_RANDOM_DROP 423 m = _getq_random(q); 424#endif 425 rp->red_stats.drop_forced++; 426 } 427 rp->red_count = 0; 428 IFCQ_CONVERT_LOCK(&rp->red_ifp->if_snd); 429 m_freem(m); 430 return (CLASSQEQ_DROPPED); 431 } 432 /* successfully queued */ 433 return (CLASSQEQ_SUCCESS); 434} 435 436/* 437 * early-drop probability is calculated as follows: 438 * prob = p_max * (avg - th_min) / (th_max - th_min) 439 * prob_a = prob / (2 - count*prob) 440 * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min)) 441 * here prob_a increases as successive undrop count increases. 442 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)), 443 * becomes 1 when (count >= (2 / prob))). 444 */ 445int 446drop_early(int fp_len, int fp_probd, int count) 447{ 448 int d; /* denominator of drop-probability */ 449 450 d = fp_probd - count * fp_len; 451 if (d <= 0) 452 /* count exceeds the hard limit: drop or mark */ 453 return (1); 454 455 /* 456 * now the range of d is [1..600] in fixed-point. (when 457 * th_max-th_min=10 and p_max=1/30) 458 * drop probability = (avg - TH_MIN) / d 459 */ 460 461 if ((random() % d) < (unsigned)fp_len) { 462 /* drop or mark */ 463 return (1); 464 } 465 /* no drop/mark */ 466 return (0); 467} 468 469static struct mbuf * 470red_getq_flow(struct red *rp, class_queue_t *q, u_int32_t flow, boolean_t purge) 471{ 472#pragma unused(purge) 473 struct mbuf *m; 474 475 /* flow of 0 means head of queue */ 476 if ((m = ((flow == 0) ? _getq(q) : _getq_flow(q, flow))) == NULL) { 477 if (rp->red_idle == 0) { 478 rp->red_idle = 1; 479 microuptime(&rp->red_last); 480 } 481 return (NULL); 482 } 483 484 rp->red_idle = 0; 485 return (m); 486} 487 488struct mbuf * 489red_getq(red_t *rp, class_queue_t *q) 490{ 491 return (red_getq_flow(rp, q, 0, FALSE)); 492} 493 494void 495red_purgeq(struct red *rp, class_queue_t *q, u_int32_t flow, u_int32_t *packets, 496 u_int32_t *bytes) 497{ 498 u_int32_t cnt = 0, len = 0; 499 struct mbuf *m; 500 501 IFCQ_CONVERT_LOCK(&rp->red_ifp->if_snd); 502 503 while ((m = red_getq_flow(rp, q, flow, TRUE)) != NULL) { 504 cnt++; 505 len += m_pktlen(m); 506 m_freem(m); 507 } 508 509 if (packets != NULL) 510 *packets = cnt; 511 if (bytes != NULL) 512 *bytes = len; 513} 514 515void 516red_updateq(red_t *rp, cqev_t ev) 517{ 518#pragma unused(rp, ev) 519 /* nothing for now */ 520} 521 522int 523red_suspendq(red_t *rp, class_queue_t *q, boolean_t on) 524{ 525#pragma unused(rp, q, on) 526 return (ENOTSUP); 527} 528 529/* 530 * helper routine to calibrate avg during idle. 531 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point 532 * here Wq = 1/weight and the code assumes Wq is close to zero. 533 * 534 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point. 535 */ 536static struct wtab *wtab_list = NULL; /* pointer to wtab list */ 537 538struct wtab * 539wtab_alloc(int weight) 540{ 541 struct wtab *w; 542 int i; 543 544 for (w = wtab_list; w != NULL; w = w->w_next) 545 if (w->w_weight == weight) { 546 w->w_refcount++; 547 return (w); 548 } 549 550 w = _MALLOC(sizeof (struct wtab), M_DEVBUF, M_WAITOK|M_ZERO); 551 if (w == NULL) 552 return (NULL); 553 554 w->w_weight = weight; 555 w->w_refcount = 1; 556 w->w_next = wtab_list; 557 wtab_list = w; 558 559 /* initialize the weight table */ 560 w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight; 561 for (i = 1; i < 32; i++) { 562 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT; 563 if (w->w_tab[i] == 0 && w->w_param_max == 0) 564 w->w_param_max = 1 << i; 565 } 566 567 return (w); 568} 569 570void 571wtab_destroy(struct wtab *w) 572{ 573 struct wtab *prev; 574 575 if (--w->w_refcount > 0) 576 return; 577 578 if (wtab_list == w) 579 wtab_list = w->w_next; 580 else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next) 581 if (prev->w_next == w) { 582 prev->w_next = w->w_next; 583 break; 584 } 585 586 _FREE(w, M_DEVBUF); 587} 588 589int32_t 590pow_w(struct wtab *w, int n) 591{ 592 int i, bit; 593 int32_t val; 594 595 if (n >= w->w_param_max) 596 return (0); 597 598 val = 1 << FP_SHIFT; 599 if (n <= 0) 600 return (val); 601 602 bit = 1; 603 i = 0; 604 while (n) { 605 if (n & bit) { 606 val = (val * w->w_tab[i]) >> FP_SHIFT; 607 n &= ~bit; 608 } 609 i++; 610 bit <<= 1; 611 } 612 return (val); 613} 614 615#endif /* CLASSQ_RED */ 616