/* * Copyright (c) 2007-2012 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. The rights granted to you under the License * may not be used to create, or enable the creation or redistribution of, * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* $OpenBSD: altq_red.c,v 1.14 2007/09/13 20:40:02 chl Exp $ */ /* $KAME: altq_red.c,v 1.10 2002/04/03 05:38:51 kjc Exp $ */ /* * Copyright (C) 1997-2003 * Sony Computer Science Laboratories Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * */ /* * Copyright (c) 1990-1994 Regents of the University of California. * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: * This product includes software developed by the Computer Systems * Engineering Group at Lawrence Berkeley Laboratory. * 4. Neither the name of the University nor of the Laboratory may be used * to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include #if CLASSQ_RED #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if INET6 #include #endif #include #include /* * ALTQ/RED (Random Early Detection) implementation using 32-bit * fixed-point calculation. * * written by kjc using the ns code as a reference. * you can learn more about red and ns from Sally's home page at * http://www-nrg.ee.lbl.gov/floyd/ * * most of the red parameter values are fixed in this implementation * to prevent fixed-point overflow/underflow. * if you change the parameters, watch out for overflow/underflow! * * the parameters used are recommended values by Sally. * the corresponding ns config looks: * q_weight=0.00195 * minthresh=5 maxthresh=15 queue-size=60 * linterm=30 * dropmech=drop-tail * bytes=false (can't be handled by 32-bit fixed-point) * doubleq=false dqthresh=false * wait=true */ /* * alternative red parameters for a slow link. * * assume the queue length becomes from zero to L and keeps L, it takes * N packets for q_avg to reach 63% of L. * when q_weight is 0.002, N is about 500 packets. * for a slow link like dial-up, 500 packets takes more than 1 minute! * when q_weight is 0.008, N is about 127 packets. * when q_weight is 0.016, N is about 63 packets. * bursts of 50 packets are allowed for 0.002, bursts of 25 packets * are allowed for 0.016. * see Sally's paper for more details. */ /* normal red parameters */ #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ /* q_weight = 0.00195 */ /* red parameters for a slow link */ #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ /* q_weight = 0.0078125 */ /* red parameters for a very slow link (e.g., dialup) */ #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ /* q_weight = 0.015625 */ /* fixed-point uses 12-bit decimal places */ #define FP_SHIFT 12 /* fixed-point shift */ /* red parameters for drop probability */ #define INV_P_MAX 10 /* inverse of max drop probability */ #define TH_MIN 5 /* min threshold */ #define TH_MAX 15 /* max threshold */ #define RED_LIMIT 60 /* default max queue lenght */ #define RED_ZONE_MAX 32 /* maximum elements in zone */ #define RED_ZONE_NAME "classq_red" /* zone name */ static unsigned int red_size; /* size of zone element */ static struct zone *red_zone; /* zone for red */ /* * our default policy for forced-drop is drop-tail. * (in altq-1.1.2 or earlier, the default was random-drop. * but it makes more sense to punish the cause of the surge.) * to switch to the random-drop policy, define "RED_RANDOM_DROP". */ /* default red parameter values */ static int default_th_min = TH_MIN; static int default_th_max = TH_MAX; static int default_inv_pmax = INV_P_MAX; static struct mbuf *red_getq_flow(struct red *, class_queue_t *, u_int32_t, boolean_t); void red_init(void) { _CASSERT(REDF_ECN4 == CLASSQF_ECN4); _CASSERT(REDF_ECN6 == CLASSQF_ECN6); red_size = sizeof (red_t); red_zone = zinit(red_size, RED_ZONE_MAX * red_size, 0, RED_ZONE_NAME); if (red_zone == NULL) { panic("%s: failed allocating %s", __func__, RED_ZONE_NAME); /* NOTREACHED */ } zone_change(red_zone, Z_EXPAND, TRUE); zone_change(red_zone, Z_CALLERACCT, TRUE); } /* * red support routines */ red_t * red_alloc(struct ifnet *ifp, int weight, int inv_pmax, int th_min, int th_max, int flags, int pkttime) { red_t *rp; int w, i; int npkts_per_sec; VERIFY(ifp != NULL); rp = zalloc(red_zone); if (rp == NULL) return (NULL); bzero(rp, red_size); rp->red_avg = 0; rp->red_idle = 1; if (weight == 0) rp->red_weight = W_WEIGHT; else rp->red_weight = weight; if (inv_pmax == 0) rp->red_inv_pmax = default_inv_pmax; else rp->red_inv_pmax = inv_pmax; if (th_min == 0) rp->red_thmin = default_th_min; else rp->red_thmin = th_min; if (th_max == 0) rp->red_thmax = default_th_max; else rp->red_thmax = th_max; rp->red_ifp = ifp; rp->red_flags = (flags & REDF_USERFLAGS); #if !PF_ECN if (rp->red_flags & REDF_ECN) { rp->red_flags &= ~REDF_ECN; log(LOG_ERR, "%s: RED ECN not available; ignoring " "REDF_ECN flag!\n", if_name(ifp)); } #endif /* !PF_ECN */ if (pkttime == 0) /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ rp->red_pkttime = 800; else rp->red_pkttime = pkttime; if (weight == 0) { /* when the link is very slow, adjust red parameters */ npkts_per_sec = 1000000 / rp->red_pkttime; if (npkts_per_sec < 50) { /* up to about 400Kbps */ rp->red_weight = W_WEIGHT_2; } else if (npkts_per_sec < 300) { /* up to about 2.4Mbps */ rp->red_weight = W_WEIGHT_1; } } /* calculate wshift. weight must be power of 2 */ w = rp->red_weight; for (i = 0; w > 1; i++) w = w >> 1; rp->red_wshift = i; w = 1 << rp->red_wshift; if (w != rp->red_weight) { printf("invalid weight value %d for red! use %d\n", rp->red_weight, w); rp->red_weight = w; } /* * thmin_s and thmax_s are scaled versions of th_min and th_max * to be compared with avg. */ rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT); rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT); /* * precompute probability denominator * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point */ rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin) * rp->red_inv_pmax) << FP_SHIFT; /* allocate weight table */ rp->red_wtab = wtab_alloc(rp->red_weight); if (rp->red_wtab == NULL) { red_destroy(rp); return (NULL); } microuptime(&rp->red_last); return (rp); } void red_destroy(red_t *rp) { if (rp->red_wtab != NULL) { wtab_destroy(rp->red_wtab); rp->red_wtab = NULL; } zfree(red_zone, rp); } void red_getstats(red_t *rp, struct red_stats *sp) { sp->q_avg = rp->red_avg >> rp->red_wshift; sp->drop_forced = rp->red_stats.drop_forced; sp->drop_unforced = rp->red_stats.drop_unforced; sp->marked_packets = rp->red_stats.marked_packets; } int red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, struct pf_mtag *tag) { #if !PF_ECN #pragma unused(tag) #endif /* !PF_ECN */ int avg, droptype; int n; avg = rp->red_avg; /* * if we were idle, we pretend that n packets arrived during * the idle period. */ if (rp->red_idle) { struct timeval now; int t; rp->red_idle = 0; microuptime(&now); t = (now.tv_sec - rp->red_last.tv_sec); if (t > 60) { /* * being idle for more than 1 minute, set avg to zero. * this prevents t from overflow. */ avg = 0; } else { t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec); n = t / rp->red_pkttime - 1; /* the following line does (avg = (1 - Wq)^n * avg) */ if (n > 0) avg = (avg >> FP_SHIFT) * pow_w(rp->red_wtab, n); } } /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */ avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift); rp->red_avg = avg; /* save the new value */ /* * red_count keeps a tally of arriving traffic that has not * been dropped. */ rp->red_count++; /* see if we drop early */ droptype = DTYPE_NODROP; if (avg >= rp->red_thmin_s && qlen(q) > 1) { if (avg >= rp->red_thmax_s) { /* avg >= th_max: forced drop */ droptype = DTYPE_FORCED; } else if (rp->red_old == 0) { /* first exceeds th_min */ rp->red_count = 1; rp->red_old = 1; } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift, rp->red_probd, rp->red_count)) { /* mark or drop by red */ #if PF_ECN if ((rp->red_flags & REDF_ECN) && (tag->pftag_proto == IPPROTO_TCP) && /* only TCP */ mark_ecn(m, tag, rp->red_flags)) { /* successfully marked. do not drop. */ rp->red_count = 0; rp->red_stats.marked_packets++; } else #endif /* PF_ECN */ { /* unforced drop by red */ droptype = DTYPE_EARLY; } } } else { /* avg < th_min */ rp->red_old = 0; } /* * if the queue length hits the hard limit, it's a forced drop. */ if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) droptype = DTYPE_FORCED; #ifdef RED_RANDOM_DROP /* if successful or forced drop, enqueue this packet. */ if (droptype != DTYPE_EARLY) _addq(q, m); #else /* if successful, enqueue this packet. */ if (droptype == DTYPE_NODROP) _addq(q, m); #endif if (droptype != DTYPE_NODROP) { if (droptype == DTYPE_EARLY) { /* drop the incoming packet */ rp->red_stats.drop_unforced++; } else { /* forced drop, select a victim packet in the queue. */ #ifdef RED_RANDOM_DROP m = _getq_random(q); #endif rp->red_stats.drop_forced++; } rp->red_count = 0; IFCQ_CONVERT_LOCK(&rp->red_ifp->if_snd); m_freem(m); return (CLASSQEQ_DROPPED); } /* successfully queued */ return (CLASSQEQ_SUCCESS); } /* * early-drop probability is calculated as follows: * prob = p_max * (avg - th_min) / (th_max - th_min) * prob_a = prob / (2 - count*prob) * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min)) * here prob_a increases as successive undrop count increases. * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)), * becomes 1 when (count >= (2 / prob))). */ int drop_early(int fp_len, int fp_probd, int count) { int d; /* denominator of drop-probability */ d = fp_probd - count * fp_len; if (d <= 0) /* count exceeds the hard limit: drop or mark */ return (1); /* * now the range of d is [1..600] in fixed-point. (when * th_max-th_min=10 and p_max=1/30) * drop probability = (avg - TH_MIN) / d */ if ((RandomULong() % d) < (unsigned)fp_len) { /* drop or mark */ return (1); } /* no drop/mark */ return (0); } static struct mbuf * red_getq_flow(struct red *rp, class_queue_t *q, u_int32_t flow, boolean_t purge) { #pragma unused(purge) struct mbuf *m; /* flow of 0 means head of queue */ if ((m = ((flow == 0) ? _getq(q) : _getq_flow(q, flow))) == NULL) { if (rp->red_idle == 0) { rp->red_idle = 1; microuptime(&rp->red_last); } return (NULL); } rp->red_idle = 0; return (m); } struct mbuf * red_getq(red_t *rp, class_queue_t *q) { return (red_getq_flow(rp, q, 0, FALSE)); } void red_purgeq(struct red *rp, class_queue_t *q, u_int32_t flow, u_int32_t *packets, u_int32_t *bytes) { u_int32_t cnt = 0, len = 0; struct mbuf *m; IFCQ_CONVERT_LOCK(&rp->red_ifp->if_snd); while ((m = red_getq_flow(rp, q, flow, TRUE)) != NULL) { cnt++; len += m_pktlen(m); m_freem(m); } if (packets != NULL) *packets = cnt; if (bytes != NULL) *bytes = len; } void red_updateq(red_t *rp, cqev_t ev) { #pragma unused(rp, ev) /* nothing for now */ } int red_suspendq(red_t *rp, class_queue_t *q, boolean_t on) { #pragma unused(rp, q, on) return (ENOTSUP); } /* * helper routine to calibrate avg during idle. * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point * here Wq = 1/weight and the code assumes Wq is close to zero. * * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point. */ static struct wtab *wtab_list = NULL; /* pointer to wtab list */ struct wtab * wtab_alloc(int weight) { struct wtab *w; int i; for (w = wtab_list; w != NULL; w = w->w_next) if (w->w_weight == weight) { w->w_refcount++; return (w); } w = _MALLOC(sizeof (struct wtab), M_DEVBUF, M_WAITOK|M_ZERO); if (w == NULL) return (NULL); w->w_weight = weight; w->w_refcount = 1; w->w_next = wtab_list; wtab_list = w; /* initialize the weight table */ w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight; for (i = 1; i < 32; i++) { w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT; if (w->w_tab[i] == 0 && w->w_param_max == 0) w->w_param_max = 1 << i; } return (w); } void wtab_destroy(struct wtab *w) { struct wtab *prev; if (--w->w_refcount > 0) return; if (wtab_list == w) wtab_list = w->w_next; else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next) if (prev->w_next == w) { prev->w_next = w->w_next; break; } _FREE(w, M_DEVBUF); } int32_t pow_w(struct wtab *w, int n) { int i, bit; int32_t val; if (n >= w->w_param_max) return (0); val = 1 << FP_SHIFT; if (n <= 0) return (val); bit = 1; i = 0; while (n) { if (n & bit) { val = (val * w->w_tab[i]) >> FP_SHIFT; n &= ~bit; } i++; bit <<= 1; } return (val); } #endif /* CLASSQ_RED */