1/* 2 * Copyright (c) 2007-2012 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29/* $OpenBSD: altq_red.c,v 1.14 2007/09/13 20:40:02 chl Exp $ */ 30/* $KAME: altq_red.c,v 1.10 2002/04/03 05:38:51 kjc Exp $ */ 31 32/* 33 * Copyright (C) 1997-2003 34 * Sony Computer Science Laboratories Inc. All rights reserved. 35 * 36 * Redistribution and use in source and binary forms, with or without 37 * modification, are permitted provided that the following conditions 38 * are met: 39 * 1. Redistributions of source code must retain the above copyright 40 * notice, this list of conditions and the following disclaimer. 41 * 2. 
Redistributions in binary form must reproduce the above copyright 42 * notice, this list of conditions and the following disclaimer in the 43 * documentation and/or other materials provided with the distribution. 44 * 45 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND 46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 48 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE 49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 55 * SUCH DAMAGE. 56 * 57 */ 58/* 59 * Copyright (c) 1990-1994 Regents of the University of California. 60 * All rights reserved. 61 * 62 * Redistribution and use in source and binary forms, with or without 63 * modification, are permitted provided that the following conditions 64 * are met: 65 * 1. Redistributions of source code must retain the above copyright 66 * notice, this list of conditions and the following disclaimer. 67 * 2. Redistributions in binary form must reproduce the above copyright 68 * notice, this list of conditions and the following disclaimer in the 69 * documentation and/or other materials provided with the distribution. 70 * 3. All advertising materials mentioning features or use of this software 71 * must display the following acknowledgement: 72 * This product includes software developed by the Computer Systems 73 * Engineering Group at Lawrence Berkeley Laboratory. 74 * 4. 
Neither the name of the University nor of the Laboratory may be used 75 * to endorse or promote products derived from this software without 76 * specific prior written permission. 77 * 78 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 81 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 82 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 88 * SUCH DAMAGE. 89 */ 90 91#include <sys/cdefs.h> 92 93#if CLASSQ_RED 94 95#include <sys/param.h> 96#include <sys/malloc.h> 97#include <sys/mbuf.h> 98#include <sys/socket.h> 99#include <sys/systm.h> 100#include <sys/syslog.h> 101#include <sys/errno.h> 102#include <sys/kauth.h> 103#include <dev/random/randomdev.h> 104#include <kern/zalloc.h> 105 106#include <net/if.h> 107 108#include <netinet/in.h> 109#include <netinet/in_systm.h> 110#include <netinet/ip.h> 111#if INET6 112#include <netinet/ip6.h> 113#endif 114 115#include <net/classq/classq_red.h> 116#include <net/net_osdep.h> 117 118/* 119 * ALTQ/RED (Random Early Detection) implementation using 32-bit 120 * fixed-point calculation. 121 * 122 * written by kjc using the ns code as a reference. 123 * you can learn more about red and ns from Sally's home page at 124 * http://www-nrg.ee.lbl.gov/floyd/ 125 * 126 * most of the red parameter values are fixed in this implementation 127 * to prevent fixed-point overflow/underflow. 
128 * if you change the parameters, watch out for overflow/underflow! 129 * 130 * the parameters used are recommended values by Sally. 131 * the corresponding ns config looks: 132 * q_weight=0.00195 133 * minthresh=5 maxthresh=15 queue-size=60 134 * linterm=30 135 * dropmech=drop-tail 136 * bytes=false (can't be handled by 32-bit fixed-point) 137 * doubleq=false dqthresh=false 138 * wait=true 139 */ 140/* 141 * alternative red parameters for a slow link. 142 * 143 * assume the queue length becomes from zero to L and keeps L, it takes 144 * N packets for q_avg to reach 63% of L. 145 * when q_weight is 0.002, N is about 500 packets. 146 * for a slow link like dial-up, 500 packets takes more than 1 minute! 147 * when q_weight is 0.008, N is about 127 packets. 148 * when q_weight is 0.016, N is about 63 packets. 149 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets 150 * are allowed for 0.016. 151 * see Sally's paper for more details. 152 */ 153/* normal red parameters */ 154#define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ 155 /* q_weight = 0.00195 */ 156 157/* red parameters for a slow link */ 158#define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ 159 /* q_weight = 0.0078125 */ 160 161/* red parameters for a very slow link (e.g., dialup) */ 162#define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ 163 /* q_weight = 0.015625 */ 164 165/* fixed-point uses 12-bit decimal places */ 166#define FP_SHIFT 12 /* fixed-point shift */ 167 168/* red parameters for drop probability */ 169#define INV_P_MAX 10 /* inverse of max drop probability */ 170#define TH_MIN 5 /* min threshold */ 171#define TH_MAX 15 /* max threshold */ 172 173#define RED_LIMIT 60 /* default max queue length */ 174 175#define RED_ZONE_MAX 32 /* maximum elements in zone */ 176#define RED_ZONE_NAME "classq_red" /* zone name */ 177 178static unsigned int red_size; /* size of zone element */ 179static struct zone *red_zone; /* zone for red */ 180 181/* 182 * our
 * default policy for forced-drop is drop-tail.
 * (in altq-1.1.2 or earlier, the default was random-drop.
 * but it makes more sense to punish the cause of the surge.)
 * to switch to the random-drop policy, define "RED_RANDOM_DROP".
 */

/* default red parameter values */
static int default_th_min = TH_MIN;
static int default_th_max = TH_MAX;
static int default_inv_pmax = INV_P_MAX;

static struct mbuf *red_getq_flow(struct red *, class_queue_t *,
    u_int32_t, boolean_t);

/*
 * One-time module initialization: create the kernel zone from which
 * red_t state blocks are allocated.  Panics if the zone cannot be
 * created, since RED cannot operate without it.
 */
void
red_init(void)
{
	/* REDF_* ECN flags must mirror the generic CLASSQF_* values */
	_CASSERT(REDF_ECN4 == CLASSQF_ECN4);
	_CASSERT(REDF_ECN6 == CLASSQF_ECN6);

	red_size = sizeof (red_t);
	red_zone = zinit(red_size, RED_ZONE_MAX * red_size,
	    0, RED_ZONE_NAME);
	if (red_zone == NULL) {
		panic("%s: failed allocating %s", __func__, RED_ZONE_NAME);
		/* NOTREACHED */
	}
	zone_change(red_zone, Z_EXPAND, TRUE);
	zone_change(red_zone, Z_CALLERACCT, TRUE);
}

/*
 * red support routines
 */
/*
 * Allocate and initialize a RED state block for interface `ifp'.
 *
 * Any of weight/inv_pmax/th_min/th_max/pkttime may be passed as 0 to
 * select the built-in default; when weight is defaulted it is further
 * adjusted downward for slow links based on pkttime.  Returns NULL if
 * the zone allocation or the weight-table allocation fails.
 */
red_t *
red_alloc(struct ifnet *ifp, int weight, int inv_pmax, int th_min,
    int th_max, int flags, int pkttime)
{
	red_t *rp;
	int w, i;
	int npkts_per_sec;

	VERIFY(ifp != NULL);

	rp = zalloc(red_zone);
	if (rp == NULL)
		return (NULL);

	bzero(rp, red_size);
	rp->red_avg = 0;
	rp->red_idle = 1;	/* queue starts out empty/idle */

	/* fall back to defaults for any zero-valued parameter */
	if (weight == 0)
		rp->red_weight = W_WEIGHT;
	else
		rp->red_weight = weight;
	if (inv_pmax == 0)
		rp->red_inv_pmax = default_inv_pmax;
	else
		rp->red_inv_pmax = inv_pmax;
	if (th_min == 0)
		rp->red_thmin = default_th_min;
	else
		rp->red_thmin = th_min;
	if (th_max == 0)
		rp->red_thmax = default_th_max;
	else
		rp->red_thmax = th_max;

	rp->red_ifp = ifp;
	rp->red_flags = (flags & REDF_USERFLAGS);
#if !PF_ECN
	/* ECN marking requires PF_ECN support; strip the flag and warn */
	if (rp->red_flags & REDF_ECN) {
		rp->red_flags &= ~REDF_ECN;
		log(LOG_ERR, "%s: RED ECN not available; ignoring "
		    "REDF_ECN flag!\n", if_name(ifp));
	}
#endif /* !PF_ECN */

	if (pkttime == 0)
		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
		rp->red_pkttime = 800;
	else
		rp->red_pkttime = pkttime;

	if (weight == 0) {
		/* when the link is very slow, adjust red parameters */
		npkts_per_sec = 1000000 / rp->red_pkttime;
		if (npkts_per_sec < 50) {
			/* up to about 400Kbps */
			rp->red_weight = W_WEIGHT_2;
		} else if (npkts_per_sec < 300) {
			/* up to about 2.4Mbps */
			rp->red_weight = W_WEIGHT_1;
		}
	}

	/* calculate wshift.  weight must be power of 2 */
	w = rp->red_weight;
	for (i = 0; w > 1; i++)
		w = w >> 1;
	rp->red_wshift = i;
	w = 1 << rp->red_wshift;
	if (w != rp->red_weight) {
		/* round a non-power-of-2 weight down and complain */
		printf("invalid weight value %d for red! use %d\n",
		    rp->red_weight, w);
		rp->red_weight = w;
	}

	/*
	 * thmin_s and thmax_s are scaled versions of th_min and th_max
	 * to be compared with avg.
	 */
	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);

	/*
	 * precompute probability denominator
	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
	 */
	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin) *
	    rp->red_inv_pmax) << FP_SHIFT;

	/* allocate weight table (shared/refcounted across instances) */
	rp->red_wtab = wtab_alloc(rp->red_weight);
	if (rp->red_wtab == NULL) {
		red_destroy(rp);
		return (NULL);
	}

	microuptime(&rp->red_last);
	return (rp);
}

/*
 * Release a RED state block: drop the weight-table reference and return
 * the block to the zone.  Safe to call on a partially-initialized block
 * (red_wtab may be NULL).
 */
void
red_destroy(red_t *rp)
{
	if (rp->red_wtab != NULL) {
		wtab_destroy(rp->red_wtab);
		rp->red_wtab = NULL;
	}
	zfree(red_zone, rp);
}

/*
 * Export a snapshot of the RED statistics into *sp.  q_avg is unscaled
 * back from the EWMA fixed-point representation (still scaled by
 * FP_SHIFT — matches the historical ALTQ reporting convention).
 */
void
red_getstats(red_t *rp, struct red_stats *sp)
{
	sp->q_avg = rp->red_avg >> rp->red_wshift;
	sp->drop_forced = rp->red_stats.drop_forced;
	sp->drop_unforced = rp->red_stats.drop_unforced;
	sp->marked_packets = rp->red_stats.marked_packets;
}

/*
 * Enqueue mbuf `m' on queue `q', applying the RED algorithm:
 * update the EWMA of the queue length, then either enqueue, ECN-mark
 * (PF_ECN builds only, TCP only), early-drop, or force-drop.
 *
 * Returns CLASSQEQ_SUCCESS if the packet was enqueued, or
 * CLASSQEQ_DROPPED if it was freed (the caller must not touch `m'
 * after a drop).  Caller is expected to hold the ifclassq lock —
 * NOTE(review): inferred from the IFCQ_CONVERT_LOCK() call; confirm
 * against callers.
 */
int
red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, struct pf_mtag *tag)
{
#if !PF_ECN
#pragma unused(tag)
#endif /* !PF_ECN */
	int avg, droptype;
	int n;

	avg = rp->red_avg;

	/*
	 * if we were idle, we pretend that n packets arrived during
	 * the idle period.
	 */
	if (rp->red_idle) {
		struct timeval now;
		int t;

		rp->red_idle = 0;
		microuptime(&now);
		t = (now.tv_sec - rp->red_last.tv_sec);
		if (t > 60) {
			/*
			 * being idle for more than 1 minute, set avg to zero.
			 * this prevents t from overflow.
			 */
			avg = 0;
		} else {
			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
			n = t / rp->red_pkttime - 1;

			/* the following line does (avg = (1 - Wq)^n * avg) */
			if (n > 0)
				avg = (avg >> FP_SHIFT) *
				    pow_w(rp->red_wtab, n);
		}
	}

	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
	rp->red_avg = avg;		/* save the new value */

	/*
	 * red_count keeps a tally of arriving traffic that has not
	 * been dropped.
	 */
	rp->red_count++;

	/* see if we drop early */
	droptype = DTYPE_NODROP;
	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
		if (avg >= rp->red_thmax_s) {
			/* avg >= th_max: forced drop */
			droptype = DTYPE_FORCED;
		} else if (rp->red_old == 0) {
			/* first exceeds th_min */
			rp->red_count = 1;
			rp->red_old = 1;
		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
		    rp->red_probd, rp->red_count)) {
			/* mark or drop by red */
#if PF_ECN
			if ((rp->red_flags & REDF_ECN) &&
			    (tag->pftag_proto == IPPROTO_TCP) && /* only TCP */
			    mark_ecn(m, tag, rp->red_flags)) {
				/* successfully marked.  do not drop. */
				rp->red_count = 0;
				rp->red_stats.marked_packets++;
			} else
#endif /* PF_ECN */
			{
				/* unforced drop by red */
				droptype = DTYPE_EARLY;
			}
		}
	} else {
		/* avg < th_min */
		rp->red_old = 0;
	}

	/*
	 * if the queue length hits the hard limit, it's a forced drop.
	 */
	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
		droptype = DTYPE_FORCED;

#ifdef RED_RANDOM_DROP
	/* if successful or forced drop, enqueue this packet. */
	if (droptype != DTYPE_EARLY)
		_addq(q, m);
#else
	/* if successful, enqueue this packet. */
	if (droptype == DTYPE_NODROP)
		_addq(q, m);
#endif
	if (droptype != DTYPE_NODROP) {
		if (droptype == DTYPE_EARLY) {
			/* drop the incoming packet */
			rp->red_stats.drop_unforced++;
		} else {
			/* forced drop, select a victim packet in the queue. */
#ifdef RED_RANDOM_DROP
			m = _getq_random(q);
#endif
			rp->red_stats.drop_forced++;
		}
		rp->red_count = 0;
		IFCQ_CONVERT_LOCK(&rp->red_ifp->if_snd);
		m_freem(m);
		return (CLASSQEQ_DROPPED);
	}
	/* successfully queued */
	return (CLASSQEQ_SUCCESS);
}

/*
 * early-drop probability is calculated as follows:
 *   prob = p_max * (avg - th_min) / (th_max - th_min)
 *   prob_a = prob / (2 - count*prob)
 *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
 * here prob_a increases as successive undrop count increases.
 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
 * becomes 1 when (count >= (2 / prob))).
 *
 * Returns 1 to drop/mark, 0 to accept.  fp_len is (avg - th_min)
 * unscaled by wshift; fp_probd is the precomputed fixed-point
 * denominator; count is packets accepted since the last drop/mark.
 */
int
drop_early(int fp_len, int fp_probd, int count)
{
	int d;		/* denominator of drop-probability */

	d = fp_probd - count * fp_len;
	if (d <= 0)
		/* count exceeds the hard limit: drop or mark */
		return (1);

	/*
	 * now the range of d is [1..600] in fixed-point. (when
	 * th_max-th_min=10 and p_max=1/30)
	 * drop probability = (avg - TH_MIN) / d
	 */

	if ((RandomULong() % d) < (unsigned)fp_len) {
		/* drop or mark */
		return (1);
	}
	/* no drop/mark */
	return (0);
}

/*
 * Dequeue one packet for `flow' (0 = plain head-of-queue dequeue).
 * When the queue turns out to be empty, record the idle transition and
 * timestamp it so red_addq() can later decay the average for the idle
 * period.  The `purge' argument is currently unused.
 */
static struct mbuf *
red_getq_flow(struct red *rp, class_queue_t *q, u_int32_t flow, boolean_t purge)
{
#pragma unused(purge)
	struct mbuf *m;

	/* flow of 0 means head of queue */
	if ((m = ((flow == 0) ? _getq(q) : _getq_flow(q, flow))) == NULL) {
		if (rp->red_idle == 0) {
			rp->red_idle = 1;
			microuptime(&rp->red_last);
		}
		return (NULL);
	}

	rp->red_idle = 0;
	return (m);
}

/*
 * Dequeue the packet at the head of the queue (convenience wrapper
 * around red_getq_flow with flow 0).
 */
struct mbuf *
red_getq(red_t *rp, class_queue_t *q)
{
	return (red_getq_flow(rp, q, 0, FALSE));
}

/*
 * Drain and free every packet belonging to `flow' (0 = entire queue).
 * On return, *packets and *bytes (each optional, may be NULL) hold the
 * number of packets and bytes purged.
 */
void
red_purgeq(struct red *rp, class_queue_t *q, u_int32_t flow, u_int32_t *packets,
    u_int32_t *bytes)
{
	u_int32_t cnt = 0, len = 0;
	struct mbuf *m;

	IFCQ_CONVERT_LOCK(&rp->red_ifp->if_snd);

	while ((m = red_getq_flow(rp, q, flow, TRUE)) != NULL) {
		cnt++;
		len += m_pktlen(m);
		m_freem(m);
	}

	if (packets != NULL)
		*packets = cnt;
	if (bytes != NULL)
		*bytes = len;
}

/*
 * Handle a class-queue event (link state change etc.).  RED keeps no
 * event-dependent state, so this is intentionally a no-op.
 */
void
red_updateq(red_t *rp, cqev_t ev)
{
#pragma unused(rp, ev)
	/* nothing for now */
}

/*
 * Suspend/resume hook; RED does not support suspension, so always
 * report ENOTSUP to the caller.
 */
int
red_suspendq(red_t *rp, class_queue_t *q, boolean_t on)
{
#pragma unused(rp, q, on)
	return (ENOTSUP);
}

/*
 * helper routine to calibrate avg during idle.
 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
 * here Wq = 1/weight and the code assumes Wq is close to zero.
 *
 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
550 */ 551static struct wtab *wtab_list = NULL; /* pointer to wtab list */ 552 553struct wtab * 554wtab_alloc(int weight) 555{ 556 struct wtab *w; 557 int i; 558 559 for (w = wtab_list; w != NULL; w = w->w_next) 560 if (w->w_weight == weight) { 561 w->w_refcount++; 562 return (w); 563 } 564 565 w = _MALLOC(sizeof (struct wtab), M_DEVBUF, M_WAITOK|M_ZERO); 566 if (w == NULL) 567 return (NULL); 568 569 w->w_weight = weight; 570 w->w_refcount = 1; 571 w->w_next = wtab_list; 572 wtab_list = w; 573 574 /* initialize the weight table */ 575 w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight; 576 for (i = 1; i < 32; i++) { 577 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT; 578 if (w->w_tab[i] == 0 && w->w_param_max == 0) 579 w->w_param_max = 1 << i; 580 } 581 582 return (w); 583} 584 585void 586wtab_destroy(struct wtab *w) 587{ 588 struct wtab *prev; 589 590 if (--w->w_refcount > 0) 591 return; 592 593 if (wtab_list == w) 594 wtab_list = w->w_next; 595 else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next) 596 if (prev->w_next == w) { 597 prev->w_next = w->w_next; 598 break; 599 } 600 601 _FREE(w, M_DEVBUF); 602} 603 604int32_t 605pow_w(struct wtab *w, int n) 606{ 607 int i, bit; 608 int32_t val; 609 610 if (n >= w->w_param_max) 611 return (0); 612 613 val = 1 << FP_SHIFT; 614 if (n <= 0) 615 return (val); 616 617 bit = 1; 618 i = 0; 619 while (n) { 620 if (n & bit) { 621 val = (val * w->w_tab[i]) >> FP_SHIFT; 622 n &= ~bit; 623 } 624 i++; 625 bit <<= 1; 626 } 627 return (val); 628} 629 630#endif /* CLASSQ_RED */ 631