1/*
2 * Copyright (c) 2007-2012 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*	$OpenBSD: altq_red.c,v 1.14 2007/09/13 20:40:02 chl Exp $	*/
30/*	$KAME: altq_red.c,v 1.10 2002/04/03 05:38:51 kjc Exp $	*/
31
32/*
33 * Copyright (C) 1997-2003
34 *	Sony Computer Science Laboratories Inc.  All rights reserved.
35 *
36 * Redistribution and use in source and binary forms, with or without
37 * modification, are permitted provided that the following conditions
38 * are met:
39 * 1. Redistributions of source code must retain the above copyright
40 *    notice, this list of conditions and the following disclaimer.
41 * 2. Redistributions in binary form must reproduce the above copyright
42 *    notice, this list of conditions and the following disclaimer in the
43 *    documentation and/or other materials provided with the distribution.
44 *
45 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
46 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
49 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
50 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
51 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55 * SUCH DAMAGE.
56 *
57 */
58/*
59 * Copyright (c) 1990-1994 Regents of the University of California.
60 * All rights reserved.
61 *
62 * Redistribution and use in source and binary forms, with or without
63 * modification, are permitted provided that the following conditions
64 * are met:
65 * 1. Redistributions of source code must retain the above copyright
66 *    notice, this list of conditions and the following disclaimer.
67 * 2. Redistributions in binary form must reproduce the above copyright
68 *    notice, this list of conditions and the following disclaimer in the
69 *    documentation and/or other materials provided with the distribution.
70 * 3. All advertising materials mentioning features or use of this software
71 *    must display the following acknowledgement:
72 *	This product includes software developed by the Computer Systems
73 *	Engineering Group at Lawrence Berkeley Laboratory.
74 * 4. Neither the name of the University nor of the Laboratory may be used
75 *    to endorse or promote products derived from this software without
76 *    specific prior written permission.
77 *
78 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
79 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
80 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
81 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
82 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
83 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
84 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
85 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
86 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
87 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
88 * SUCH DAMAGE.
89 */
90
91#include <sys/cdefs.h>
92
93#if CLASSQ_RED
94
95#include <sys/param.h>
96#include <sys/malloc.h>
97#include <sys/mbuf.h>
98#include <sys/socket.h>
99#include <sys/systm.h>
100#include <sys/errno.h>
101#include <sys/kauth.h>
102
103#include <kern/zalloc.h>
104
105#include <net/if.h>
106
107#include <netinet/in.h>
108#include <netinet/in_systm.h>
109#include <netinet/ip.h>
110#if INET6
111#include <netinet/ip6.h>
112#endif
113
114#include <net/classq/classq_red.h>
115
116/*
117 * ALTQ/RED (Random Early Detection) implementation using 32-bit
118 * fixed-point calculation.
119 *
120 * written by kjc using the ns code as a reference.
121 * you can learn more about red and ns from Sally's home page at
122 * http://www-nrg.ee.lbl.gov/floyd/
123 *
124 * most of the red parameter values are fixed in this implementation
125 * to prevent fixed-point overflow/underflow.
126 * if you change the parameters, watch out for overflow/underflow!
127 *
128 * the parameters used are recommended values by Sally.
129 * the corresponding ns config looks:
130 *	q_weight=0.00195
131 *	minthresh=5 maxthresh=15 queue-size=60
132 *	linterm=30
133 *	dropmech=drop-tail
134 *	bytes=false (can't be handled by 32-bit fixed-point)
135 *	doubleq=false dqthresh=false
136 *	wait=true
137 */
138/*
139 * alternative red parameters for a slow link.
140 *
141 * assume the queue length becomes from zero to L and keeps L, it takes
142 * N packets for q_avg to reach 63% of L.
143 * when q_weight is 0.002, N is about 500 packets.
144 * for a slow link like dial-up, 500 packets takes more than 1 minute!
145 * when q_weight is 0.008, N is about 127 packets.
146 * when q_weight is 0.016, N is about 63 packets.
147 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
148 * are allowed for 0.016.
149 * see Sally's paper for more details.
150 */
151/* normal red parameters */
152#define	W_WEIGHT	512	/* inverse of weight of EWMA (511/512) */
153				/* q_weight = 0.00195 */
154
155/* red parameters for a slow link */
156#define	W_WEIGHT_1	128	/* inverse of weight of EWMA (127/128) */
157				/* q_weight = 0.0078125 */
158
159/* red parameters for a very slow link (e.g., dialup) */
160#define	W_WEIGHT_2	64	/* inverse of weight of EWMA (63/64) */
161				/* q_weight = 0.015625 */
162
163/* fixed-point uses 12-bit decimal places */
164#define	FP_SHIFT	12	/* fixed-point shift */
165
166/* red parameters for drop probability */
167#define	INV_P_MAX	10	/* inverse of max drop probability */
168#define	TH_MIN		5	/* min threshold */
169#define	TH_MAX		15	/* max threshold */
170
171#define	RED_LIMIT	60	/* default max queue lenght */
172
173#define	RED_ZONE_MAX	32		/* maximum elements in zone */
174#define	RED_ZONE_NAME	"classq_red"	/* zone name */
175
176static unsigned int red_size;		/* size of zone element */
177static struct zone *red_zone;		/* zone for red */
178
179/*
180 * our default policy for forced-drop is drop-tail.
181 * (in altq-1.1.2 or earlier, the default was random-drop.
182 * but it makes more sense to punish the cause of the surge.)
183 * to switch to the random-drop policy, define "RED_RANDOM_DROP".
184 */
185
186/* default red parameter values */
187static int default_th_min = TH_MIN;
188static int default_th_max = TH_MAX;
189static int default_inv_pmax = INV_P_MAX;
190
191static struct mbuf *red_getq_flow(struct red *, class_queue_t *,
192    u_int32_t, boolean_t);
193
194void
195red_init(void)
196{
197	_CASSERT(REDF_ECN4 == CLASSQF_ECN4);
198	_CASSERT(REDF_ECN6 == CLASSQF_ECN6);
199
200	red_size = sizeof (red_t);
201	red_zone = zinit(red_size, RED_ZONE_MAX * red_size,
202	    0, RED_ZONE_NAME);
203	if (red_zone == NULL) {
204		panic("%s: failed allocating %s", __func__, RED_ZONE_NAME);
205		/* NOTREACHED */
206	}
207	zone_change(red_zone, Z_EXPAND, TRUE);
208	zone_change(red_zone, Z_CALLERACCT, TRUE);
209}
210
211/*
212 * red support routines
213 */
214red_t *
215red_alloc(struct ifnet *ifp, int weight, int inv_pmax, int th_min,
216    int th_max, int flags, int pkttime)
217{
218	red_t	*rp;
219	int	 w, i;
220	int	 npkts_per_sec;
221
222	VERIFY(ifp != NULL);
223
224	rp = zalloc(red_zone);
225	if (rp == NULL)
226		return (NULL);
227
228	bzero(rp, red_size);
229	rp->red_avg = 0;
230	rp->red_idle = 1;
231
232	if (weight == 0)
233		rp->red_weight = W_WEIGHT;
234	else
235		rp->red_weight = weight;
236	if (inv_pmax == 0)
237		rp->red_inv_pmax = default_inv_pmax;
238	else
239		rp->red_inv_pmax = inv_pmax;
240	if (th_min == 0)
241		rp->red_thmin = default_th_min;
242	else
243		rp->red_thmin = th_min;
244	if (th_max == 0)
245		rp->red_thmax = default_th_max;
246	else
247		rp->red_thmax = th_max;
248
249	rp->red_flags = (flags & REDF_USERFLAGS);
250	rp->red_ifp = ifp;
251
252	if (pkttime == 0)
253		/* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
254		rp->red_pkttime = 800;
255	else
256		rp->red_pkttime = pkttime;
257
258	if (weight == 0) {
259		/* when the link is very slow, adjust red parameters */
260		npkts_per_sec = 1000000 / rp->red_pkttime;
261		if (npkts_per_sec < 50) {
262			/* up to about 400Kbps */
263			rp->red_weight = W_WEIGHT_2;
264		} else if (npkts_per_sec < 300) {
265			/* up to about 2.4Mbps */
266			rp->red_weight = W_WEIGHT_1;
267		}
268	}
269
270	/* calculate wshift.  weight must be power of 2 */
271	w = rp->red_weight;
272	for (i = 0; w > 1; i++)
273		w = w >> 1;
274	rp->red_wshift = i;
275	w = 1 << rp->red_wshift;
276	if (w != rp->red_weight) {
277		printf("invalid weight value %d for red! use %d\n",
278		    rp->red_weight, w);
279		rp->red_weight = w;
280	}
281
282	/*
283	 * thmin_s and thmax_s are scaled versions of th_min and th_max
284	 * to be compared with avg.
285	 */
286	rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
287	rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
288
289	/*
290	 * precompute probability denominator
291	 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
292	 */
293	rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin) *
294	    rp->red_inv_pmax) << FP_SHIFT;
295
296	/* allocate weight table */
297	rp->red_wtab = wtab_alloc(rp->red_weight);
298	if (rp->red_wtab == NULL) {
299		red_destroy(rp);
300		return (NULL);
301	}
302
303	microuptime(&rp->red_last);
304	return (rp);
305}
306
307void
308red_destroy(red_t *rp)
309{
310	if (rp->red_wtab != NULL) {
311		wtab_destroy(rp->red_wtab);
312		rp->red_wtab = NULL;
313	}
314	zfree(red_zone, rp);
315}
316
317void
318red_getstats(red_t *rp, struct red_stats *sp)
319{
320	sp->q_avg		= rp->red_avg >> rp->red_wshift;
321	sp->drop_forced		= rp->red_stats.drop_forced;
322	sp->drop_unforced	= rp->red_stats.drop_unforced;
323	sp->marked_packets	= rp->red_stats.marked_packets;
324}
325
326int
327red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, struct pf_mtag *tag)
328{
329	int avg, droptype;
330	int n;
331
332	avg = rp->red_avg;
333
334	/*
335	 * if we were idle, we pretend that n packets arrived during
336	 * the idle period.
337	 */
338	if (rp->red_idle) {
339		struct timeval now;
340		int t;
341
342		rp->red_idle = 0;
343		microuptime(&now);
344		t = (now.tv_sec - rp->red_last.tv_sec);
345		if (t > 60) {
346			/*
347			 * being idle for more than 1 minute, set avg to zero.
348			 * this prevents t from overflow.
349			 */
350			avg = 0;
351		} else {
352			t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
353			n = t / rp->red_pkttime - 1;
354
355			/* the following line does (avg = (1 - Wq)^n * avg) */
356			if (n > 0)
357				avg = (avg >> FP_SHIFT) *
358				    pow_w(rp->red_wtab, n);
359		}
360	}
361
362	/* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
363	avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
364	rp->red_avg = avg;		/* save the new value */
365
366	/*
367	 * red_count keeps a tally of arriving traffic that has not
368	 * been dropped.
369	 */
370	rp->red_count++;
371
372	/* see if we drop early */
373	droptype = DTYPE_NODROP;
374	if (avg >= rp->red_thmin_s && qlen(q) > 1) {
375		if (avg >= rp->red_thmax_s) {
376			/* avg >= th_max: forced drop */
377			droptype = DTYPE_FORCED;
378		} else if (rp->red_old == 0) {
379			/* first exceeds th_min */
380			rp->red_count = 1;
381			rp->red_old = 1;
382		} else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
383		    rp->red_probd, rp->red_count)) {
384			/* mark or drop by red */
385			if ((rp->red_flags & REDF_ECN) &&
386			    (tag->pftag_flags & PF_TAG_TCP) &&	/* only TCP */
387			    mark_ecn(m, tag, rp->red_flags)) {
388				/* successfully marked.  do not drop. */
389				rp->red_count = 0;
390				rp->red_stats.marked_packets++;
391			} else {
392				/* unforced drop by red */
393				droptype = DTYPE_EARLY;
394			}
395		}
396	} else {
397		/* avg < th_min */
398		rp->red_old = 0;
399	}
400
401	/*
402	 * if the queue length hits the hard limit, it's a forced drop.
403	 */
404	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
405		droptype = DTYPE_FORCED;
406
407#ifdef RED_RANDOM_DROP
408	/* if successful or forced drop, enqueue this packet. */
409	if (droptype != DTYPE_EARLY)
410		_addq(q, m);
411#else
412	/* if successful, enqueue this packet. */
413	if (droptype == DTYPE_NODROP)
414		_addq(q, m);
415#endif
416	if (droptype != DTYPE_NODROP) {
417		if (droptype == DTYPE_EARLY) {
418			/* drop the incoming packet */
419			rp->red_stats.drop_unforced++;
420		} else {
421			/* forced drop, select a victim packet in the queue. */
422#ifdef RED_RANDOM_DROP
423			m = _getq_random(q);
424#endif
425			rp->red_stats.drop_forced++;
426		}
427		rp->red_count = 0;
428		IFCQ_CONVERT_LOCK(&rp->red_ifp->if_snd);
429		m_freem(m);
430		return (CLASSQEQ_DROPPED);
431	}
432	/* successfully queued */
433	return (CLASSQEQ_SUCCESS);
434}
435
436/*
437 * early-drop probability is calculated as follows:
438 *   prob = p_max * (avg - th_min) / (th_max - th_min)
439 *   prob_a = prob / (2 - count*prob)
440 *	    = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
441 * here prob_a increases as successive undrop count increases.
442 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
443 * becomes 1 when (count >= (2 / prob))).
444 */
445int
446drop_early(int fp_len, int fp_probd, int count)
447{
448	int	d;		/* denominator of drop-probability */
449
450	d = fp_probd - count * fp_len;
451	if (d <= 0)
452		/* count exceeds the hard limit: drop or mark */
453		return (1);
454
455	/*
456	 * now the range of d is [1..600] in fixed-point. (when
457	 * th_max-th_min=10 and p_max=1/30)
458	 * drop probability = (avg - TH_MIN) / d
459	 */
460
461	if ((random() % d) < (unsigned)fp_len) {
462		/* drop or mark */
463		return (1);
464	}
465	/* no drop/mark */
466	return (0);
467}
468
469static struct mbuf *
470red_getq_flow(struct red *rp, class_queue_t *q, u_int32_t flow, boolean_t purge)
471{
472#pragma unused(purge)
473	struct mbuf *m;
474
475	/* flow of 0 means head of queue */
476	if ((m = ((flow == 0) ? _getq(q) : _getq_flow(q, flow))) == NULL) {
477		if (rp->red_idle == 0) {
478			rp->red_idle = 1;
479			microuptime(&rp->red_last);
480		}
481		return (NULL);
482	}
483
484	rp->red_idle = 0;
485	return (m);
486}
487
488struct mbuf *
489red_getq(red_t *rp, class_queue_t *q)
490{
491	return (red_getq_flow(rp, q, 0, FALSE));
492}
493
494void
495red_purgeq(struct red *rp, class_queue_t *q, u_int32_t flow, u_int32_t *packets,
496    u_int32_t *bytes)
497{
498	u_int32_t cnt = 0, len = 0;
499	struct mbuf *m;
500
501	IFCQ_CONVERT_LOCK(&rp->red_ifp->if_snd);
502
503	while ((m = red_getq_flow(rp, q, flow, TRUE)) != NULL) {
504		cnt++;
505		len += m_pktlen(m);
506		m_freem(m);
507	}
508
509	if (packets != NULL)
510		*packets = cnt;
511	if (bytes != NULL)
512		*bytes = len;
513}
514
515void
516red_updateq(red_t *rp, cqev_t ev)
517{
518#pragma unused(rp, ev)
519	/* nothing for now */
520}
521
522int
523red_suspendq(red_t *rp, class_queue_t *q, boolean_t on)
524{
525#pragma unused(rp, q, on)
526	return (ENOTSUP);
527}
528
529/*
530 * helper routine to calibrate avg during idle.
531 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
532 * here Wq = 1/weight and the code assumes Wq is close to zero.
533 *
534 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
535 */
536static struct wtab *wtab_list = NULL;	/* pointer to wtab list */
537
538struct wtab *
539wtab_alloc(int weight)
540{
541	struct wtab	*w;
542	int		 i;
543
544	for (w = wtab_list; w != NULL; w = w->w_next)
545		if (w->w_weight == weight) {
546			w->w_refcount++;
547			return (w);
548		}
549
550	w = _MALLOC(sizeof (struct wtab), M_DEVBUF, M_WAITOK|M_ZERO);
551	if (w == NULL)
552		return (NULL);
553
554	w->w_weight = weight;
555	w->w_refcount = 1;
556	w->w_next = wtab_list;
557	wtab_list = w;
558
559	/* initialize the weight table */
560	w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
561	for (i = 1; i < 32; i++) {
562		w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
563		if (w->w_tab[i] == 0 && w->w_param_max == 0)
564			w->w_param_max = 1 << i;
565	}
566
567	return (w);
568}
569
570void
571wtab_destroy(struct wtab *w)
572{
573	struct wtab	*prev;
574
575	if (--w->w_refcount > 0)
576		return;
577
578	if (wtab_list == w)
579		wtab_list = w->w_next;
580	else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
581		if (prev->w_next == w) {
582			prev->w_next = w->w_next;
583			break;
584		}
585
586	_FREE(w, M_DEVBUF);
587}
588
589int32_t
590pow_w(struct wtab *w, int n)
591{
592	int	i, bit;
593	int32_t	val;
594
595	if (n >= w->w_param_max)
596		return (0);
597
598	val = 1 << FP_SHIFT;
599	if (n <= 0)
600		return (val);
601
602	bit = 1;
603	i = 0;
604	while (n) {
605		if (n & bit) {
606			val = (val * w->w_tab[i]) >> FP_SHIFT;
607			n &= ~bit;
608		}
609		i++;
610		bit <<=  1;
611	}
612	return (val);
613}
614
615#endif /* CLASSQ_RED */
616