1/*
2 * Copyright (c) 2011-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/cdefs.h>
30#include <sys/param.h>
31#include <sys/mbuf.h>
32#include <sys/socket.h>
33#include <sys/sockio.h>
34#include <sys/systm.h>
35#include <sys/sysctl.h>
36#include <sys/syslog.h>
37#include <sys/proc.h>
38#include <sys/errno.h>
39#include <sys/kernel.h>
40#include <sys/kauth.h>
41
42#include <kern/zalloc.h>
43
44#include <net/if.h>
45#include <net/if_var.h>
46#include <net/if_types.h>
47#include <net/dlil.h>
48#include <net/flowadv.h>
49
50#include <netinet/in.h>
51#include <netinet/in_systm.h>
52#include <netinet/ip.h>
53#if INET6
54#include <netinet/ip6.h>
55#endif
56
57#include <net/classq/classq_sfb.h>
58#include <net/flowhash.h>
59#include <net/net_osdep.h>
60#include <dev/random/randomdev.h>
61
62/*
63 * Stochastic Fair Blue
64 *
65 * Wu-chang Feng, Dilip D. Kandlur, Debanjan Saha, Kang G. Shin
66 * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf
67 *
68 * Based on the NS code with the following parameters:
69 *
70 *   bytes:	false
71 *   decrement:	0.001
72 *   increment:	0.005
73 *   hold-time:	10ms-50ms (randomized)
74 *   algorithm:	0
75 *   pbox:	1
76 *   pbox-time:	50-100ms (randomized)
77 *   hinterval:	11-23 (randomized)
78 *
79 * This implementation uses L = 2 and N = 32 for 2 sets of:
80 *
81 *	B[L][N]: L x N array of bins (L levels, N bins per level)
82 *
83 * Each set effectively creates 32^2 virtual buckets (bin combinations)
84 * while using only O(32*2) states.
85 *
86 * Given a 32-bit hash value, we divide it such that octets [0,1,2,3] are
87 * used as index for the bins across the 2 levels, where level 1 uses [0,2]
88 * and level 2 uses [1,3].  The 2 values per level correspond to the indices
89 * for the current and warm-up sets (section 4.4. in the SFB paper regarding
90 * Moving Hash Functions explains the purposes of these 2 sets.)
91 */
92
93/*
94 * Use Murmur3A_x86_32 for hash function.  It seems to perform consistently
95 * across platforms for 1-word key (32-bit flowhash value).  See flowhash.h
96 * for other alternatives.  We only need 16-bit hash output.
97 */
#define	SFB_HASH	net_flowhash_mh3_x86_32
#define	SFB_HASHMASK	HASHMASK(16)

/* Extract a bin index (low SFB_BINS_SHIFT bits) from one hash octet */
#define	SFB_BINMASK(_x) \
	((_x) & HASHMASK(SFB_BINS_SHIFT))

/* Statistics of the bin at level _l, index _n, in set _c (0 or 1) */
#define	SFB_BINST(_sp, _l, _n, _c) \
	(&(*(_sp)->sfb_bins)[_c].stats[_l][_n])

/* Freeze time (last pmark-update timestamp) of the same bin */
#define	SFB_BINFT(_sp, _l, _n, _c) \
	(&(*(_sp)->sfb_bins)[_c].freezetime[_l][_n])

/* Flow control list anchored at bin _n (level SFB_FC_LEVEL only) */
#define	SFB_FC_LIST(_sp, _n) \
	(&(*(_sp)->sfb_fc_lists)[_n])
112
113/*
114 * The holdtime parameter determines the minimum time interval between
115 * two successive updates of the marking probability.  In the event the
116 * uplink speed is not known, a default value is chosen and is randomized
117 * to be within the following range.
118 */
119#define	HOLDTIME_BASE	(100ULL * 1000 * 1000)	/* 100ms */
120#define	HOLDTIME_MIN	(10ULL * 1000 * 1000)	/* 10ms */
121#define	HOLDTIME_MAX	(100ULL * 1000 * 1000)	/* 100ms */
122
123/*
124 * The pboxtime parameter determines the bandwidth allocated for rogue
125 * flows, i.e. the rate limiting bandwidth.  In the event the uplink speed
126 * is not known, a default value is chosen and is randomized to be within
127 * the following range.
128 */
129#define	PBOXTIME_BASE	(300ULL * 1000 * 1000)	/* 300ms */
130#define	PBOXTIME_MIN	(30ULL * 1000 * 1000)	/* 30ms */
131#define	PBOXTIME_MAX	(300ULL * 1000 * 1000)	/* 300ms */
132
/*
 * Pick a random value in [tmin, tmax).  The previous form,
 * (random % tmax) + tmin, produced values in [tmin, tmin + tmax) and
 * could therefore exceed the documented maximum (e.g. holdtime up to
 * ~110ms when the range is stated as 10ms-100ms).
 */
#define	SFB_RANDOM(sp, tmin, tmax) \
	((sfb_random(sp) % ((tmax) - (tmin))) + (tmin))
134
#define	SFB_PKT_PBOX	0x1		/* in penalty box */

/* The following mantissa values are in SFB_FP_SHIFT Q format */
#define	SFB_MAX_PMARK	(1 << SFB_FP_SHIFT) /* Q14 representation of 1.00 */

/*
 * These are d1 (increment) and d2 (decrement) parameters, used to determine
 * the amount by which the marking probability is incremented when the queue
 * overflows, or is decremented when the link is idle.  d1 is set higher than
 * d2, because link underutilization can occur when congestion management is
 * either too conservative or too aggressive, but packet loss occurs only
 * when congestion management is too conservative.  By weighing heavily
 * against packet loss, it can quickly reach to a substantial increase in
 * traffic load.
 */
#define	SFB_INCREMENT	82		/* Q14 representation of 0.005 */
#define	SFB_DECREMENT	16		/* Q14 representation of 0.001 */

#define	SFB_PMARK_TH	16056		/* Q14 representation of 0.98 */
#define	SFB_PMARK_WARM	3276		/* Q14 representation of 0.2 */

/* Raise a bin's marking probability, saturating at SFB_MAX_PMARK (1.0) */
#define	SFB_PMARK_INC(_bin) do {					\
	(_bin)->pmark += sfb_increment;					\
	if ((_bin)->pmark > SFB_MAX_PMARK)				\
		(_bin)->pmark = SFB_MAX_PMARK;				\
} while (0)

/* Lower a bin's marking probability, clamping at 0 */
#define	SFB_PMARK_DEC(_bin) do {					\
	if ((_bin)->pmark > 0) {					\
		(_bin)->pmark -= sfb_decrement;				\
		if ((_bin)->pmark < 0)					\
			(_bin)->pmark = 0;				\
	}								\
} while (0)
169
#define	HINTERVAL_MIN	(10)	/* 10 seconds */
#define	HINTERVAL_MAX	(20)	/* 20 seconds */
/*
 * Random rehash interval in [HINTERVAL_MIN, HINTERVAL_MAX] seconds.
 * The previous form, (random % HINTERVAL_MAX) + HINTERVAL_MIN, produced
 * values in [10, 30) rather than the documented 10-20 second range.
 */
#define	SFB_HINTERVAL(sp) \
	((sfb_random(sp) % (HINTERVAL_MAX - HINTERVAL_MIN + 1)) + HINTERVAL_MIN)
173
#define	DEQUEUE_DECAY	7		/* ilog2 of EWMA decay rate, (128) */
/*
 * A dequeue-time spike is declared when the new measurement exceeds
 * roughly 2048x the old one.  Note: ABS() is defined below this macro;
 * that is fine because macros expand at the use site.
 */
#define	DEQUEUE_SPIKE(_new, _old)	\
	((u_int64_t)ABS((int64_t)(_new) - (int64_t)(_old)) > ((_old) << 11))

#define	ABS(v)  (((v) > 0) ? (v) : -(v))

#define	SFB_ZONE_MAX		32		/* maximum elements in zone */
#define	SFB_ZONE_NAME		"classq_sfb"	/* zone name */

#define	SFB_BINS_ZONE_MAX	32		/* maximum elements in zone */
#define	SFB_BINS_ZONE_NAME	"classq_sfb_bins" /* zone name */

#define	SFB_FCL_ZONE_MAX	32		/* maximum elements in zone */
#define	SFB_FCL_ZONE_NAME	"classq_sfb_fcl" /* zone name */

/* Place the flow control entries in current bin on level 0 */
#define	SFB_FC_LEVEL	0

/*
 * Store SFB hash and flags in the module private scratch space of
 * struct pkthdr; hash8/hash16/hash32 alias the same 32-bit word.
 */
#define	pkt_sfb_hash8	pkt_mpriv.__mpriv_u.__mpriv32[0].__mpriv32_u.__val8
#define	pkt_sfb_hash16	pkt_mpriv.__mpriv_u.__mpriv32[0].__mpriv32_u.__val16
#define	pkt_sfb_hash32	pkt_mpriv.__mpriv_u.__mpriv32[0].__mpriv32_u.__val32
#define	pkt_sfb_flags	pkt_mpriv.__mpriv_u.__mpriv32[1].__mpriv32_u.__val32

/* zalloc zones created once in sfb_init() */
static unsigned int sfb_size;		/* size of zone element */
static struct zone *sfb_zone;		/* zone for sfb */

static unsigned int sfb_bins_size;	/* size of zone element */
static struct zone *sfb_bins_zone;	/* zone for sfb_bins */

static unsigned int sfb_fcl_size;	/* size of zone element */
static struct zone *sfb_fcl_zone;	/* zone for sfb_fc_lists */
206
207/* internal function prototypes */
208static u_int32_t sfb_random(struct sfb *);
209static struct mbuf *sfb_getq_flow(struct sfb *, class_queue_t *, u_int32_t,
210    boolean_t);
211static void sfb_resetq(struct sfb *, cqev_t);
212static void sfb_calc_holdtime(struct sfb *, u_int64_t);
213static void sfb_calc_pboxtime(struct sfb *, u_int64_t);
214static void sfb_calc_hinterval(struct sfb *, u_int64_t *);
215static void sfb_swap_bins(struct sfb *, u_int32_t);
216static inline int sfb_pcheck(struct sfb *, struct pkthdr *);
217static int sfb_penalize(struct sfb *, struct pkthdr *, struct timespec *);
218static void sfb_adjust_bin(struct sfb *, struct sfbbinstats *,
219    struct timespec *, struct timespec *, boolean_t);
220static void sfb_decrement_bin(struct sfb *, struct sfbbinstats *,
221    struct timespec *, struct timespec *);
222static void sfb_increment_bin(struct sfb *, struct sfbbinstats *,
223    struct timespec *, struct timespec *);
224static inline void sfb_dq_update_bins(struct sfb *, struct pkthdr *,
225    struct timespec *);
226static inline void sfb_eq_update_bins(struct sfb *, struct pkthdr *);
227static int sfb_drop_early(struct sfb *, struct pkthdr *, u_int16_t *,
228    struct timespec *);
229static boolean_t sfb_bin_addfcentry(struct sfb *, struct pkthdr *);
230static void sfb_fclist_append(struct sfb *, struct sfb_fcl *);
231static void sfb_fclists_clean(struct sfb *sp);
232
/* net.classq.sfb.* tunables; a value of 0 generally means "automatic" */
SYSCTL_NODE(_net_classq, OID_AUTO, sfb, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "SFB");

static u_int64_t sfb_holdtime = 0;	/* 0 indicates "automatic" */
SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, holdtime, CTLFLAG_RW|CTLFLAG_LOCKED,
    &sfb_holdtime, "SFB freeze time in nanoseconds");

static u_int64_t sfb_pboxtime = 0;	/* 0 indicates "automatic" */
SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, pboxtime, CTLFLAG_RW|CTLFLAG_LOCKED,
    &sfb_pboxtime, "SFB penalty box time in nanoseconds");

static u_int64_t sfb_hinterval;
SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, hinterval, CTLFLAG_RW|CTLFLAG_LOCKED,
    &sfb_hinterval, "SFB hash interval in nanoseconds");

static u_int32_t sfb_increment = SFB_INCREMENT;
SYSCTL_UINT(_net_classq_sfb, OID_AUTO, increment, CTLFLAG_RW|CTLFLAG_LOCKED,
    &sfb_increment, SFB_INCREMENT, "SFB increment [d1]");

static u_int32_t sfb_decrement = SFB_DECREMENT;
SYSCTL_UINT(_net_classq_sfb, OID_AUTO, decrement, CTLFLAG_RW|CTLFLAG_LOCKED,
    &sfb_decrement, SFB_DECREMENT, "SFB decrement [d2]");

static u_int32_t sfb_allocation = 0;	/* 0 means "automatic" */
SYSCTL_UINT(_net_classq_sfb, OID_AUTO, allocation, CTLFLAG_RW|CTLFLAG_LOCKED,
    &sfb_allocation, 0, "SFB bin allocation");

static u_int32_t sfb_ratelimit = 0;
SYSCTL_UINT(_net_classq_sfb, OID_AUTO, ratelimit, CTLFLAG_RW|CTLFLAG_LOCKED,
	&sfb_ratelimit, 0, "SFB rate limit");
262
#define	MBPS	(1ULL * 1000 * 1000)
#define	GBPS	(MBPS * 1000)

/* Per-link-speed defaults; scanned linearly in the sfb_calc_* routines */
struct sfb_time_tbl {
	u_int64_t	speed;		/* uplink speed */
	u_int64_t	holdtime;	/* hold time */
	u_int64_t	pboxtime;	/* penalty box time */
};

/* Times scale inversely with speed; terminated by a zero-speed entry */
static struct sfb_time_tbl sfb_ttbl[] = {
	{   1 * MBPS,	HOLDTIME_BASE * 1000,	PBOXTIME_BASE * 1000	},
	{  10 * MBPS,	HOLDTIME_BASE * 100,	PBOXTIME_BASE * 100	},
	{ 100 * MBPS,	HOLDTIME_BASE * 10,	PBOXTIME_BASE * 10	},
	{   1 * GBPS,	HOLDTIME_BASE,		PBOXTIME_BASE		},
	{  10 * GBPS,	HOLDTIME_BASE / 10,	PBOXTIME_BASE / 10	},
	{ 100 * GBPS,	HOLDTIME_BASE / 100,	PBOXTIME_BASE / 100	},
	{ 0, 0, 0 }
};
281
/*
 * One-time module initialization: create the zalloc zones for struct
 * sfb, its bins array and its flow control lists.  Panics if any zone
 * cannot be created, since SFB cannot operate without them.
 */
void
sfb_init(void)
{
	/* SFB's ECN flag values must mirror the generic classq ones */
	_CASSERT(SFBF_ECN4 == CLASSQF_ECN4);
	_CASSERT(SFBF_ECN6 == CLASSQF_ECN6);

	sfb_size = sizeof (struct sfb);
	sfb_zone = zinit(sfb_size, SFB_ZONE_MAX * sfb_size,
	    0, SFB_ZONE_NAME);
	if (sfb_zone == NULL) {
		panic("%s: failed allocating %s", __func__, SFB_ZONE_NAME);
		/* NOTREACHED */
	}
	zone_change(sfb_zone, Z_EXPAND, TRUE);
	zone_change(sfb_zone, Z_CALLERACCT, TRUE);

	/* element size derived from the pointed-to type of sfb_bins */
	sfb_bins_size = sizeof (*((struct sfb *)0)->sfb_bins);
	sfb_bins_zone = zinit(sfb_bins_size, SFB_BINS_ZONE_MAX * sfb_bins_size,
	    0, SFB_BINS_ZONE_NAME);
	if (sfb_bins_zone == NULL) {
		panic("%s: failed allocating %s", __func__, SFB_BINS_ZONE_NAME);
		/* NOTREACHED */
	}
	zone_change(sfb_bins_zone, Z_EXPAND, TRUE);
	zone_change(sfb_bins_zone, Z_CALLERACCT, TRUE);

	sfb_fcl_size = sizeof (*((struct sfb *)0)->sfb_fc_lists);
	sfb_fcl_zone = zinit(sfb_fcl_size, SFB_FCL_ZONE_MAX * sfb_fcl_size,
	    0, SFB_FCL_ZONE_NAME);
	if (sfb_fcl_zone == NULL) {
		panic("%s: failed allocating %s", __func__, SFB_FCL_ZONE_NAME);
		/* NOTREACHED */
	}
	zone_change(sfb_fcl_zone, Z_EXPAND, TRUE);
	zone_change(sfb_fcl_zone, Z_CALLERACCT, TRUE);
}
318
/*
 * Return a 32-bit random number.  RandomULong() may block, so the
 * interface send-queue lock is converted first via IFCQ_CONVERT_LOCK.
 */
static u_int32_t
sfb_random(struct sfb *sp)
{
	IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
	return (RandomULong());
}
325
326static void
327sfb_calc_holdtime(struct sfb *sp, u_int64_t outbw)
328{
329	u_int64_t holdtime;
330
331	if (sfb_holdtime != 0) {
332		holdtime = sfb_holdtime;
333	} else if (outbw == 0) {
334		holdtime = SFB_RANDOM(sp, HOLDTIME_MIN, HOLDTIME_MAX);
335	} else {
336		unsigned int n, i;
337
338		n = sfb_ttbl[0].holdtime;
339		for (i = 0; sfb_ttbl[i].speed != 0; i++) {
340			if (outbw < sfb_ttbl[i].speed)
341				break;
342			n = sfb_ttbl[i].holdtime;
343		}
344		holdtime = n;
345	}
346	net_nsectimer(&holdtime, &sp->sfb_holdtime);
347}
348
349static void
350sfb_calc_pboxtime(struct sfb *sp, u_int64_t outbw)
351{
352	u_int64_t pboxtime;
353
354	if (sfb_pboxtime != 0) {
355		pboxtime = sfb_pboxtime;
356	} else if (outbw == 0) {
357		pboxtime = SFB_RANDOM(sp, PBOXTIME_MIN, PBOXTIME_MAX);
358	} else {
359		unsigned int n, i;
360
361		n = sfb_ttbl[0].pboxtime;
362		for (i = 0; sfb_ttbl[i].speed != 0; i++) {
363			if (outbw < sfb_ttbl[i].speed)
364				break;
365			n = sfb_ttbl[i].pboxtime;
366		}
367		pboxtime = n;
368	}
369	net_nsectimer(&pboxtime, &sp->sfb_pboxtime);
370	net_timerclear(&sp->sfb_pboxfreeze);
371}
372
/*
 * Compute the bin-rehash interval and schedule the next rehash time.
 * Precedence: sysctl override (sfb_hinterval), then caller-supplied
 * *t (when non-NULL and non-zero), then a randomized default.  Note
 * the short-circuit below: hinterval is only read when t != NULL, so
 * it is never used uninitialized.
 */
static void
sfb_calc_hinterval(struct sfb *sp, u_int64_t *t)
{
	u_int64_t hinterval;
	struct timespec now;

	if (t != NULL) {
		/*
		 * TODO adi@apple.com: use dq_avg to derive hinterval.
		 */
		hinterval = *t;
	}

	if (sfb_hinterval != 0)
		hinterval = sfb_hinterval;
	else if (t == NULL || hinterval == 0)
		hinterval = ((u_int64_t)SFB_HINTERVAL(sp) * NSEC_PER_SEC);

	net_nsectimer(&hinterval, &sp->sfb_hinterval);

	nanouptime(&now);
	net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset);
}
396
397/*
398 * sfb support routines
399 */
/*
 * Allocate and initialize an SFB instance for queue qid (limit qlim)
 * on interface ifp.  Only SFBF_USERFLAGS are honored from flags, and
 * ECN is stripped when PF_ECN support is compiled out.  Returns NULL
 * on allocation failure; the result is released via sfb_destroy().
 */
struct sfb *
sfb_alloc(struct ifnet *ifp, u_int32_t qid, u_int32_t qlim, u_int32_t flags)
{
	struct sfb *sp;
	int i;

	VERIFY(ifp != NULL && qlim > 0);

	sp = zalloc(sfb_zone);
	if (sp == NULL) {
		log(LOG_ERR, "%s: SFB unable to allocate\n", if_name(ifp));
		return (NULL);
	}
	bzero(sp, sfb_size);

	if ((sp->sfb_bins = zalloc(sfb_bins_zone)) == NULL) {
		log(LOG_ERR, "%s: SFB unable to allocate bins\n", if_name(ifp));
		/* sfb_destroy() copes with the partially-constructed sp */
		sfb_destroy(sp);
		return (NULL);
	}
	bzero(sp->sfb_bins, sfb_bins_size);

	if ((sp->sfb_fc_lists = zalloc(sfb_fcl_zone)) == NULL) {
		log(LOG_ERR, "%s: SFB unable to allocate flow control lists\n",
		    if_name(ifp));
		sfb_destroy(sp);
		return(NULL);
	}
	bzero(sp->sfb_fc_lists, sfb_fcl_size);

	/* one flow control list per level-0 bin */
	for (i = 0; i < SFB_BINS; ++i)
		STAILQ_INIT(&SFB_FC_LIST(sp, i)->fclist);

	sp->sfb_ifp = ifp;
	sp->sfb_qlim = qlim;
	sp->sfb_qid = qid;
	sp->sfb_flags = (flags & SFBF_USERFLAGS);
#if !PF_ECN
	if (sp->sfb_flags & SFBF_ECN) {
		sp->sfb_flags &= ~SFBF_ECN;
		log(LOG_ERR, "%s: SFB qid=%d, ECN not available; ignoring "
		    "SFBF_ECN flag!\n", if_name(ifp), sp->sfb_qid);
	}
#endif /* !PF_ECN */

	sfb_resetq(sp, -1);

	return (sp);
}
449
/*
 * Hand all entries of one flow control list to the global flow
 * advisory mechanism, which wakes up the blocked senders.  May block,
 * hence the lock conversion.  flowadv_add() consumes the list, so the
 * trailing VERIFY checks it is empty again.
 */
static void
sfb_fclist_append(struct sfb *sp, struct sfb_fcl *fcl)
{
	IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);

	VERIFY(STAILQ_EMPTY(&fcl->fclist) || fcl->cnt > 0);
	sp->sfb_stats.flow_feedback += fcl->cnt;
	fcl->cnt = 0;

	flowadv_add(&fcl->fclist);
	VERIFY(fcl->cnt == 0 && STAILQ_EMPTY(&fcl->fclist));
}
462
463static void
464sfb_fclists_clean(struct sfb *sp)
465{
466	int i;
467
468	/* Move all the flow control entries to the flowadv list */
469	for (i = 0; i < SFB_BINS; ++i) {
470		struct sfb_fcl *fcl = SFB_FC_LIST(sp, i);
471		if (!STAILQ_EMPTY(&fcl->fclist))
472			sfb_fclist_append(sp, fcl);
473	}
474}
475
/*
 * Tear down an SFB instance: flush pending flow control entries, then
 * free the bins, the flow control lists and finally the sfb itself.
 * Safe to call on a partially-constructed sp (NULL sub-allocations).
 */
void
sfb_destroy(struct sfb *sp)
{
	sfb_fclists_clean(sp);
	if (sp->sfb_bins != NULL) {
		zfree(sfb_bins_zone, sp->sfb_bins);
		sp->sfb_bins = NULL;
	}
	if (sp->sfb_fc_lists != NULL) {
		zfree(sfb_fcl_zone, sp->sfb_fc_lists);
		sp->sfb_fc_lists = NULL;
	}
	zfree(sfb_zone, sp);
}
490
/*
 * (Re)initialize SFB state for classq event ev: re-randomize the bin
 * hash perturbation and recompute thresholds (skipped on link-down),
 * refresh timing parameters from the current link rate, flush flow
 * control lists on link transitions, and clear bins and statistics.
 */
static void
sfb_resetq(struct sfb *sp, cqev_t ev)
{
	struct ifnet *ifp = sp->sfb_ifp;
	u_int64_t eff_rate;

	VERIFY(ifp != NULL);

	if (ev != CLASSQ_EV_LINK_DOWN) {
		/* fresh hash perturbation for both bin sets */
		(*sp->sfb_bins)[0].fudge = sfb_random(sp);
		(*sp->sfb_bins)[1].fudge = sfb_random(sp);
		sp->sfb_allocation = ((sfb_allocation == 0) ?
		    (sp->sfb_qlim / 3) : sfb_allocation);
		/* drop threshold is 1.5x the per-bin allocation */
		sp->sfb_drop_thresh = sp->sfb_allocation +
		    (sp->sfb_allocation >> 1);
	}

	sp->sfb_clearpkts = 0;
	sp->sfb_current = 0;

	eff_rate = ifnet_output_linkrate(ifp);
	sp->sfb_eff_rate = eff_rate;

	sfb_calc_holdtime(sp, eff_rate);
	sfb_calc_pboxtime(sp, eff_rate);
	sfb_calc_hinterval(sp, NULL);

	/* on link transitions, release any flow-controlled senders */
	if (ev == CLASSQ_EV_LINK_DOWN ||
		ev == CLASSQ_EV_LINK_UP)
		sfb_fclists_clean(sp);

	bzero(sp->sfb_bins, sizeof (*sp->sfb_bins));
	bzero(&sp->sfb_stats, sizeof (sp->sfb_stats));

	if (ev == CLASSQ_EV_LINK_DOWN || !classq_verbose)
		return;

	log(LOG_DEBUG, "%s: SFB qid=%d, holdtime=%llu nsec, "
	    "pboxtime=%llu nsec, allocation=%d, drop_thresh=%d, "
	    "hinterval=%d sec, sfb_bins=%d bytes, eff_rate=%llu bps\n",
	    if_name(ifp), sp->sfb_qid, (u_int64_t)sp->sfb_holdtime.tv_nsec,
	    (u_int64_t)sp->sfb_pboxtime.tv_nsec,
	    (u_int32_t)sp->sfb_allocation, (u_int32_t)sp->sfb_drop_thresh,
	    (int)sp->sfb_hinterval.tv_sec, (int)sizeof (*sp->sfb_bins),
	    eff_rate);
}
537
538void
539sfb_getstats(struct sfb *sp, struct sfb_stats *sps)
540{
541	sps->allocation = sp->sfb_allocation;
542	sps->dropthresh = sp->sfb_drop_thresh;
543	sps->clearpkts = sp->sfb_clearpkts;
544	sps->current = sp->sfb_current;
545
546	net_timernsec(&sp->sfb_holdtime, &sp->sfb_stats.hold_time);
547	net_timernsec(&sp->sfb_pboxtime, &sp->sfb_stats.pbox_time);
548	net_timernsec(&sp->sfb_hinterval, &sp->sfb_stats.rehash_intval);
549	*(&(sps->sfbstats)) = *(&(sp->sfb_stats));
550
551	_CASSERT(sizeof ((*sp->sfb_bins)[0].stats) ==
552	    sizeof (sps->binstats[0].stats));
553
554	bcopy(&(*sp->sfb_bins)[0].stats, &sps->binstats[0].stats,
555	    sizeof (sps->binstats[0].stats));
556	bcopy(&(*sp->sfb_bins)[1].stats, &sps->binstats[1].stats,
557	    sizeof (sps->binstats[1].stats));
558}
559
/*
 * Rehash: make the warm-up bin set the current one (see section 4.4 of
 * the SFB paper on moving hash functions).  Re-perturbs the outgoing
 * set, flushes flow control lists, clamps statistics of the new
 * current set, and caps the new warm-up set's pmark at SFB_PMARK_WARM
 * so non-responsive flows are re-identified quickly.  len is the
 * current queue length, remembered in sfb_clearpkts.
 */
static void
sfb_swap_bins(struct sfb *sp, u_int32_t len)
{
	int i, j, s;

	if (sp->sfb_flags & SFBF_SUSPENDED)
		return;

	s = sp->sfb_current;
	VERIFY((s + (s ^ 1)) == 1);

	(*sp->sfb_bins)[s].fudge = sfb_random(sp); /* recompute perturbation */
	sp->sfb_clearpkts = len;
	sp->sfb_stats.num_rehash++;

	s = (sp->sfb_current ^= 1);	/* flip the bit (swap current) */

	if (classq_verbose) {
		log(LOG_DEBUG, "%s: SFB qid=%d, set %d is now current, "
		    "qlen=%d\n", if_name(sp->sfb_ifp), sp->sfb_qid, s, len);
	}

	/* clear freezetime for all current bins */
	bzero(&(*sp->sfb_bins)[s].freezetime,
	    sizeof ((*sp->sfb_bins)[s].freezetime));

	/* clear/adjust bin statistics and flow control lists */
	for (i = 0; i < SFB_BINS; i++) {
		struct sfb_fcl *fcl = SFB_FC_LIST(sp, i);

		if (!STAILQ_EMPTY(&fcl->fclist))
			sfb_fclist_append(sp, fcl);

		for (j = 0; j < SFB_LEVELS; j++) {
			struct sfbbinstats *cbin, *wbin;

			cbin = SFB_BINST(sp, j, i, s);		/* current */
			wbin = SFB_BINST(sp, j, i, s ^ 1);	/* warm-up */

			cbin->pkts = 0;
			if (cbin->pmark > SFB_MAX_PMARK)
				cbin->pmark = SFB_MAX_PMARK;
			if (cbin->pmark < 0)
				cbin->pmark = 0;

			/*
			 * Keep pmark from before to identify
			 * non-responsives immediately.
			 */
			if (wbin->pmark > SFB_PMARK_WARM)
				wbin->pmark = SFB_PMARK_WARM;
		}
	}
}
614
/*
 * Penalty check: return 1 if every current-set bin this packet hashes
 * into has pmark >= SFB_PMARK_TH (the flow looks non-responsive), else
 * return 0.
 */
static inline int
sfb_pcheck(struct sfb *sp, struct pkthdr *pkt)
{
#if SFB_LEVELS != 2
	int i, n;
#endif /* SFB_LEVELS != 2 */
	int s;

	s = sp->sfb_current;
	VERIFY((s + (s ^ 1)) == 1);

	/*
	 * For current bins, returns 1 if all pmark >= SFB_PMARK_TH,
	 * 0 otherwise; optimize for SFB_LEVELS=2.
	 */
#if SFB_LEVELS == 2
	/*
	 * Level 0: bin index at [0] for set 0; [2] for set 1
	 * Level 1: bin index at [1] for set 0; [3] for set 1
	 */
	if (SFB_BINST(sp, 0, SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]),
	    s)->pmark < SFB_PMARK_TH ||
	    SFB_BINST(sp, 1, SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1) + 1]),
	    s)->pmark < SFB_PMARK_TH)
		return (0);
#else /* SFB_LEVELS != 2 */
	for (i = 0; i < SFB_LEVELS; i++) {
		if (s == 0)		/* set 0, bin index [0,1] */
			n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]);
		else			/* set 1, bin index [2,3] */
			n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]);

		if (SFB_BINST(sp, i, n, s)->pmark < SFB_PMARK_TH)
			return (0);
	}
#endif /* SFB_LEVELS != 2 */
	return (1);
}
653
/*
 * Decide whether pkt belongs to a non-responsive flow that should be
 * rate-limited.  Returns 1 (drop) while the flow's penalty box is
 * still active, also growing the warm-up bins' pmark; otherwise tags
 * the packet SFB_PKT_PBOX, restarts the box timer and returns 0 so
 * the packet is queued.  Inactive unless sfb_ratelimit is set.
 */
static int
sfb_penalize(struct sfb *sp, struct pkthdr *pkt, struct timespec *now)
{
	struct timespec delta = { 0, 0 };

	/* If minimum pmark of current bins is < SFB_PMARK_TH, we're done */
	if (!sfb_ratelimit || !sfb_pcheck(sp, pkt))
		return (0);

	net_timersub(now, &sp->sfb_pboxfreeze, &delta);
	if (net_timercmp(&delta, &sp->sfb_pboxtime, <)) {
#if SFB_LEVELS != 2
		int i;
#endif /* SFB_LEVELS != 2 */
		struct sfbbinstats *bin;
		int n, w;

		w = sp->sfb_current ^ 1;
		VERIFY((w + (w ^ 1)) == 1);

		/*
		 * Update warm-up bins; optimize for SFB_LEVELS=2
		 */
#if SFB_LEVELS == 2
		/* Level 0: bin index at [0] for set 0; [2] for set 1 */
		n = SFB_BINMASK(pkt->pkt_sfb_hash8[(w << 1)]);
		bin = SFB_BINST(sp, 0, n, w);
		if (bin->pkts >= sp->sfb_allocation)
			sfb_increment_bin(sp, bin, SFB_BINFT(sp, 0, n, w), now);

		/* Level 1: bin index at [1] for set 0; [3] for set 1 */
		n = SFB_BINMASK(pkt->pkt_sfb_hash8[(w << 1) + 1]);
		bin = SFB_BINST(sp, 1, n, w);
		if (bin->pkts >= sp->sfb_allocation)
			sfb_increment_bin(sp, bin, SFB_BINFT(sp, 1, n, w), now);
#else /* SFB_LEVELS != 2 */
		for (i = 0; i < SFB_LEVELS; i++) {
			if (w == 0)	/* set 0, bin index [0,1] */
				n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]);
			else		/* set 1, bin index [2,3] */
				n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]);

			bin = SFB_BINST(sp, i, n, w);
			if (bin->pkts >= sp->sfb_allocation) {
				sfb_increment_bin(sp, bin,
				    SFB_BINFT(sp, i, n, w), now);
			}
		}
#endif /* SFB_LEVELS != 2 */
		return (1);
	}

	/* non-conformant or else misclassified flow; queue it anyway */
	pkt->pkt_sfb_flags |= SFB_PKT_PBOX;
	*(&sp->sfb_pboxfreeze) = *now;

	return (0);
}
712
/*
 * Apply one pmark increment (inc=TRUE) or decrement (inc=FALSE) to
 * bin, rate-limited to at most once per holdtime; *ft is the bin's
 * last-update (freeze) timestamp and is advanced on a real update.
 */
static void
sfb_adjust_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
    struct timespec *now, boolean_t inc)
{
	struct timespec delta;

	net_timersub(now, ft, &delta);
	if (net_timercmp(&delta, &sp->sfb_holdtime, <)) {
		if (classq_verbose > 1) {
			log(LOG_DEBUG, "%s: SFB qid=%d, %s update frozen "
			    "(delta=%llu nsec)\n", if_name(sp->sfb_ifp),
			    sp->sfb_qid, inc ?  "increment" : "decrement",
			    (u_int64_t)delta.tv_nsec);
		}
		return;
	}

	/* increment/decrement marking probability */
	*ft = *now;
	if (inc)
		SFB_PMARK_INC(bin);
	else
		SFB_PMARK_DEC(bin);
}
737
738static void
739sfb_decrement_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
740    struct timespec *now)
741{
742	return (sfb_adjust_bin(sp, bin, ft, now, FALSE));
743}
744
745static void
746sfb_increment_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft,
747    struct timespec *now)
748{
749	return (sfb_adjust_bin(sp, bin, ft, now, TRUE));
750}
751
/*
 * Dequeue-time bookkeeping: decrement the packet count of each current
 * bin this packet hashed into; when a bin drains to zero its pmark is
 * decremented, and when the flow-control-level bin falls to 1/4 of the
 * allocation its flow control list is flushed so senders may resume.
 */
static inline void
sfb_dq_update_bins(struct sfb *sp, struct pkthdr *pkt, struct timespec *now)
{
#if SFB_LEVELS != 2 || SFB_FC_LEVEL != 0
	int i;
#endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
	struct sfbbinstats *bin;
	int s, n;
	struct sfb_fcl *fcl = NULL;

	s = sp->sfb_current;
	VERIFY((s + (s ^ 1)) == 1);

	/*
	 * Update current bins; optimize for SFB_LEVELS=2 and SFB_FC_LEVEL=0
	 */
#if SFB_LEVELS == 2 && SFB_FC_LEVEL == 0
	/* Level 0: bin index at [0] for set 0; [2] for set 1 */
	n = SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]);
	bin = SFB_BINST(sp, 0, n, s);

	VERIFY(bin->pkts > 0);
	if (--bin->pkts == 0) {
		sfb_decrement_bin(sp, bin, SFB_BINFT(sp, 0, n, s), now);
	}
	if (bin->pkts <= (sp->sfb_allocation >> 2)) {
		/* deliver flow control feedback to the sockets */
		fcl = SFB_FC_LIST(sp, n);
		if (!STAILQ_EMPTY(&fcl->fclist))
			sfb_fclist_append(sp, fcl);
	}

	/* Level 1: bin index at [1] for set 0; [3] for set 1 */
	n = SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1) + 1]);
	bin = SFB_BINST(sp, 1, n, s);

	VERIFY(bin->pkts > 0);
	if (--bin->pkts == 0)
		sfb_decrement_bin(sp, bin, SFB_BINFT(sp, 1, n, s), now);
#else /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
	for (i = 0; i < SFB_LEVELS; i++) {
		if (s == 0)		/* set 0, bin index [0,1] */
			n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]);
		else			/* set 1, bin index [2,3] */
			n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]);

		bin = SFB_BINST(sp, i, n, s);

		VERIFY(bin->pkts > 0);
		if (--bin->pkts == 0) {
			sfb_decrement_bin(sp, bin,
			    SFB_BINFT(sp, i, n, s), now);
		}
		if (bin->pkts <= (sp->sfb_allocation >> 2)) {
			/* deliver flow control feedback to the sockets */
			if (i == SFB_FC_LEVEL) {
				fcl = SFB_FC_LIST(sp, n);
				if (!STAILQ_EMPTY(&fcl->fclist))
					sfb_fclist_append(sp, fcl);
			}
		}
	}
#endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */
}
816
/*
 * Enqueue-time bookkeeping: increment the packet count of each
 * current bin this packet hashed into.
 */
static inline void
sfb_eq_update_bins(struct sfb *sp, struct pkthdr *pkt)
{
#if SFB_LEVELS != 2
	int i, n;
#endif /* SFB_LEVELS != 2 */
	int s;

	s = sp->sfb_current;
	VERIFY((s + (s ^ 1)) == 1);

	/*
	 * Update current bins; optimize for SFB_LEVELS=2
	 */
#if SFB_LEVELS == 2
	/* Level 0: bin index at [0] for set 0; [2] for set 1 */
	SFB_BINST(sp, 0,
	    SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]), s)->pkts++;

	/* Level 1: bin index at [1] for set 0; [3] for set 1 */
	SFB_BINST(sp, 1,
	    SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1) + 1]), s)->pkts++;
#else /* SFB_LEVELS != 2 */
	for (i = 0; i < SFB_LEVELS; i++) {
		if (s == 0)		/* set 0, bin index [0,1] */
			n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]);
		else			/* set 1, bin index [2,3] */
			n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]);

		SFB_BINST(sp, i, n, s)->pkts++;
	}
#endif /* SFB_LEVELS != 2 */
}
850
/*
 * Put the packet's flow on the flow control list of its level-0
 * current bin, so the sender can be advised to back off.  Returns
 * TRUE if the flow is (or already was) on the list; FALSE when the
 * packet carries no flow ID or the entry allocation fails.
 */
static boolean_t
sfb_bin_addfcentry(struct sfb *sp, struct pkthdr *pkt)
{
	struct flowadv_fcentry *fce;
	u_int32_t flowsrc, flowid;
	struct sfb_fcl *fcl;
	int s;

	s = sp->sfb_current;
	VERIFY((s + (s ^ 1)) == 1);

	flowsrc = pkt->pkt_flowsrc;
	flowid = pkt->pkt_flowid;

	if (flowid == 0) {
		sp->sfb_stats.null_flowid++;
		return (FALSE);
	}

	/*
	 * Use value at index 0 for set 0 and
	 * value at index 2 for set 1
	 */
	fcl = SFB_FC_LIST(sp, SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]));
	STAILQ_FOREACH(fce, &fcl->fclist, fce_link) {
		if (fce->fce_flowsrc == flowsrc &&
		    fce->fce_flowid == flowid) {
			/* Already on flow control list; just return */
			return (TRUE);
		}
	}

	/* M_WAITOK allocation may block; convert the ifclassq lock first */
	IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
	fce = flowadv_alloc_entry(M_WAITOK);
	if (fce != NULL) {
		fce->fce_flowsrc = flowsrc;
		fce->fce_flowid = flowid;
		STAILQ_INSERT_TAIL(&fcl->fclist, fce, fce_link);
		fcl->cnt++;
		sp->sfb_stats.flow_controlled++;
	}

	return (fce != NULL);
}
895
896/*
897 * early-drop probability is kept in pmark of each bin of the flow
898 */
/*
 * Early-drop decision for pkt: bump each current bin's pmark when its
 * occupancy reaches the allocation, and return 1 (drop/mark) when any
 * bin is at the drop threshold or the queue is suspended; *pmin is set
 * to the minimum pmark seen, used by the caller for ECN marking.
 */
static int
sfb_drop_early(struct sfb *sp, struct pkthdr *pkt, u_int16_t *pmin,
    struct timespec *now)
{
#if SFB_LEVELS != 2
	int i;
#endif /* SFB_LEVELS != 2 */
	struct sfbbinstats *bin;
	int s, n, ret = 0;

	s = sp->sfb_current;
	VERIFY((s + (s ^ 1)) == 1);

	/* start from the maximum; minimized over the levels below */
	*pmin = (u_int16_t)-1;

	/*
	 * Update current bins; optimize for SFB_LEVELS=2
	 */
#if SFB_LEVELS == 2
	/* Level 0: bin index at [0] for set 0; [2] for set 1 */
	n = SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]);
	bin = SFB_BINST(sp, 0, n, s);
	if (*pmin > (u_int16_t)bin->pmark)
		*pmin = (u_int16_t)bin->pmark;

	if (bin->pkts >= sp->sfb_allocation) {
		if (bin->pkts >= sp->sfb_drop_thresh)
			ret = 1;	/* drop or mark */
		sfb_increment_bin(sp, bin, SFB_BINFT(sp, 0, n, s), now);
	}

	/* Level 1: bin index at [1] for set 0; [3] for set 1 */
	n = SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1) + 1]);
	bin = SFB_BINST(sp, 1, n, s);
	if (*pmin > (u_int16_t)bin->pmark)
		*pmin = (u_int16_t)bin->pmark;

	if (bin->pkts >= sp->sfb_allocation) {
		if (bin->pkts >= sp->sfb_drop_thresh)
			ret = 1;	/* drop or mark */
		sfb_increment_bin(sp, bin, SFB_BINFT(sp, 1, n, s), now);
	}
#else /* SFB_LEVELS != 2 */
	for (i = 0; i < SFB_LEVELS; i++) {
		if (s == 0)		/* set 0, bin index [0,1] */
			n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]);
		else			/* set 1, bin index [2,3] */
			n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]);

		bin = SFB_BINST(sp, i, n, s);
		if (*pmin > (u_int16_t)bin->pmark)
			*pmin = (u_int16_t)bin->pmark;

		if (bin->pkts >= sp->sfb_allocation) {
			if (bin->pkts >= sp->sfb_drop_thresh)
				ret = 1;	/* drop or mark */
			sfb_increment_bin(sp, bin,
			    SFB_BINFT(sp, i, n, s), now);
		}
	}
#endif /* SFB_LEVELS != 2 */

	/* while suspended, everything is dropped or marked */
	if (sp->sfb_flags & SFBF_SUSPENDED)
		ret = 1;	/* drop or mark */

	return (ret);
}
966
/* drop-type classification used by sfb_addq() */
#define	DTYPE_NODROP	0	/* no drop */
#define	DTYPE_FORCED	1	/* a "forced" drop */
#define	DTYPE_EARLY	2	/* an "unforced" (early) drop */
970
/*
 * Enqueue a packet subject to the SFB (Stochastic Fair Blue) AQM
 * decision.  The packet is hashed into the Blue bins; based on the bin
 * occupancy the packet is either queued as-is, queued with a
 * flow-control advisory, ECN-marked (when PF_ECN is built in), dropped
 * early (unforced), or force-dropped (penalty box or hard queue limit).
 *
 * Returns CLASSQEQ_SUCCESS or CLASSQEQ_SUCCESS_FC when the packet was
 * queued; otherwise the mbuf is freed here and a CLASSQEQ_DROPPED*
 * code is returned.
 */
int
sfb_addq(struct sfb *sp, class_queue_t *q, struct mbuf *m, struct pf_mtag *t)
{
#if !PF_ECN
#pragma unused(t)
#endif /* !PF_ECN */
	struct pkthdr *pkt = &m->m_pkthdr;
	struct timespec now;
	int droptype, s;
	u_int16_t pmin;
	int fc_adv = 0;
	int ret = CLASSQEQ_SUCCESS;

	nanouptime(&now);

	s = sp->sfb_current;
	VERIFY((s + (s ^ 1)) == 1);	/* current bin set must be 0 or 1 */

	/* time to swap the bins? */
	if (net_timercmp(&now, &sp->sfb_nextreset, >=)) {
		net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset);
		sfb_swap_bins(sp, qlen(q));
		s = sp->sfb_current;
		VERIFY((s + (s ^ 1)) == 1);
	}

	/*
	 * Hash the flow ID into both bin sets, each with its own
	 * perturbation value (fudge), so the packet can be accounted
	 * against either set across a future bin swap.
	 */
	pkt->pkt_sfb_flags = 0;
	pkt->pkt_sfb_hash16[s] =
	    (SFB_HASH(&pkt->pkt_flowid, sizeof (pkt->pkt_flowid),
	    (*sp->sfb_bins)[s].fudge) & SFB_HASHMASK);
	pkt->pkt_sfb_hash16[s ^ 1] =
	    (SFB_HASH(&pkt->pkt_flowid, sizeof (pkt->pkt_flowid),
	    (*sp->sfb_bins)[s ^ 1].fudge) & SFB_HASHMASK);

	/* see if we drop early */
	droptype = DTYPE_NODROP;
	if (sfb_drop_early(sp, pkt, &pmin, &now)) {
		/* flow control, mark or drop by sfb */
		if ((sp->sfb_flags & SFBF_FLOWCTL) &&
		    (pkt->pkt_flags & PKTF_FLOW_ADV)) {
			fc_adv = 1;
			/* drop all during suspension or for non-TCP */
			if ((sp->sfb_flags & SFBF_SUSPENDED) ||
			    pkt->pkt_proto != IPPROTO_TCP) {
				droptype = DTYPE_EARLY;
				sp->sfb_stats.drop_early++;
			}
		}
#if PF_ECN
		else if ((sp->sfb_flags & SFBF_ECN) &&
		    (pkt->pkt_proto == IPPROTO_TCP) && /* only for TCP */
		    ((sfb_random(sp) & SFB_MAX_PMARK) <= pmin) &&
		    mark_ecn(m, t, sp->sfb_flags) &&
		    !(sp->sfb_flags & SFBF_SUSPENDED)) {
			/* successfully marked; do not drop. */
			sp->sfb_stats.marked_packets++;
		}
#endif /* PF_ECN */
		else {
			/* unforced drop by sfb */
			droptype = DTYPE_EARLY;
			sp->sfb_stats.drop_early++;
		}
	}

	/* non-responsive flow penalty? */
	if (droptype == DTYPE_NODROP && sfb_penalize(sp, pkt, &now)) {
		droptype = DTYPE_FORCED;
		sp->sfb_stats.drop_pbox++;
	}

	/* if the queue length hits the hard limit, it's a forced drop */
	if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) {
		droptype = DTYPE_FORCED;
		sp->sfb_stats.drop_queue++;
	}

	/*
	 * A flow-advisory candidate that was not force-dropped gets an
	 * entry in the bin's flow-control list; the return code then
	 * tells the caller to deliver the advisory and, on drop, why.
	 */
	if (fc_adv == 1 && droptype != DTYPE_FORCED &&
	    sfb_bin_addfcentry(sp, pkt)) {
		/* deliver flow control advisory error */
		if (droptype == DTYPE_NODROP) {
			ret = CLASSQEQ_SUCCESS_FC;
			VERIFY(!(sp->sfb_flags & SFBF_SUSPENDED));
		} else if (sp->sfb_flags & SFBF_SUSPENDED) {
			/* dropped due to suspension */
			ret = CLASSQEQ_DROPPED_SP;
		} else {
			/* dropped due to flow-control */
			ret = CLASSQEQ_DROPPED_FC;
		}
	}

	/* if successful enqueue this packet, else drop it */
	if (droptype == DTYPE_NODROP) {
		_addq(q, m);
	} else {
		/* about to free the mbuf: convert to exclusive lock first */
		IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
		m_freem(m);
		return ((ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROPPED);
	}

	/* penalty-boxed packets are not charged to the bins on enqueue */
	if (!(pkt->pkt_sfb_flags & SFB_PKT_PBOX))
		sfb_eq_update_bins(sp, pkt);
	else
		sp->sfb_stats.pbox_packets++;

	/* successfully queued */
	return (ret);
}
1080
1081static struct mbuf *
1082sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge)
1083{
1084	struct timespec now;
1085	struct mbuf *m;
1086	struct pkthdr *pkt;
1087
1088	if (!purge && (sp->sfb_flags & SFBF_SUSPENDED))
1089		return (NULL);
1090
1091	nanouptime(&now);
1092
1093	/* flow of 0 means head of queue */
1094	if ((m = ((flow == 0) ? _getq(q) : _getq_flow(q, flow))) == NULL) {
1095		if (!purge)
1096			net_timerclear(&sp->sfb_getqtime);
1097		return (NULL);
1098	}
1099
1100	VERIFY(m->m_flags & M_PKTHDR);
1101
1102	pkt = &m->m_pkthdr;
1103
1104	if (!purge) {
1105		/* calculate EWMA of dequeues */
1106		if (net_timerisset(&sp->sfb_getqtime)) {
1107			struct timespec delta;
1108			u_int64_t avg, new;
1109
1110			net_timersub(&now, &sp->sfb_getqtime, &delta);
1111			net_timernsec(&delta, &new);
1112			avg = sp->sfb_stats.dequeue_avg;
1113			if (avg > 0) {
1114				int decay = DEQUEUE_DECAY;
1115				/*
1116				 * If the time since last dequeue is
1117				 * significantly greater than the current
1118				 * average, weight the average more against
1119				 * the old value.
1120				 */
1121				if (DEQUEUE_SPIKE(new, avg))
1122					decay += 5;
1123				avg = (((avg << decay) - avg) + new) >> decay;
1124			} else {
1125				avg = new;
1126			}
1127			sp->sfb_stats.dequeue_avg = avg;
1128		}
1129		*(&sp->sfb_getqtime) = *(&now);
1130	}
1131
1132	/*
1133	 * Clearpkts are the ones which were in the queue when the hash
1134	 * function was perturbed.  Since the perturbation value (fudge),
1135	 * and thus bin information for these packets is not known, we do
1136	 * not change accounting information while dequeuing these packets.
1137	 * It is important not to set the hash interval too small due to
1138	 * this reason.  A rule of thumb is to set it to K*D, where D is
1139	 * the time taken to drain queue.
1140	 */
1141	if (pkt->pkt_sfb_flags & SFB_PKT_PBOX) {
1142		pkt->pkt_sfb_flags &= ~SFB_PKT_PBOX;
1143		if (sp->sfb_clearpkts > 0)
1144			sp->sfb_clearpkts--;
1145	} else if (sp->sfb_clearpkts > 0) {
1146		sp->sfb_clearpkts--;
1147	} else {
1148		sfb_dq_update_bins(sp, pkt, &now);
1149	}
1150
1151	return (m);
1152}
1153
1154struct mbuf *
1155sfb_getq(struct sfb *sp, class_queue_t *q)
1156{
1157	return (sfb_getq_flow(sp, q, 0, FALSE));
1158}
1159
1160void
1161sfb_purgeq(struct sfb *sp, class_queue_t *q, u_int32_t flow, u_int32_t *packets,
1162    u_int32_t *bytes)
1163{
1164	u_int32_t cnt = 0, len = 0;
1165	struct mbuf *m;
1166
1167	IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd);
1168
1169	while ((m = sfb_getq_flow(sp, q, flow, TRUE)) != NULL) {
1170		cnt++;
1171		len += m_pktlen(m);
1172		m_freem(m);
1173	}
1174
1175	if (packets != NULL)
1176		*packets = cnt;
1177	if (bytes != NULL)
1178		*bytes = len;
1179}
1180
1181void
1182sfb_updateq(struct sfb *sp, cqev_t ev)
1183{
1184	struct ifnet *ifp = sp->sfb_ifp;
1185
1186	VERIFY(ifp != NULL);
1187
1188	switch (ev) {
1189	case CLASSQ_EV_LINK_BANDWIDTH: {
1190		u_int64_t eff_rate = ifnet_output_linkrate(ifp);
1191
1192		/* update parameters only if rate has changed */
1193		if (eff_rate == sp->sfb_eff_rate)
1194			break;
1195
1196		if (classq_verbose) {
1197			log(LOG_DEBUG, "%s: SFB qid=%d, adapting to new "
1198			    "eff_rate=%llu bps\n", if_name(ifp), sp->sfb_qid,
1199			    eff_rate);
1200		}
1201		sfb_calc_holdtime(sp, eff_rate);
1202		sfb_calc_pboxtime(sp, eff_rate);
1203		break;
1204	}
1205
1206	case CLASSQ_EV_LINK_UP:
1207	case CLASSQ_EV_LINK_DOWN:
1208		if (classq_verbose) {
1209			log(LOG_DEBUG, "%s: SFB qid=%d, resetting due to "
1210			    "link %s\n", if_name(ifp), sp->sfb_qid,
1211			    (ev == CLASSQ_EV_LINK_UP) ? "UP" : "DOWN");
1212		}
1213		sfb_resetq(sp, ev);
1214		break;
1215
1216	case CLASSQ_EV_LINK_LATENCY:
1217	case CLASSQ_EV_LINK_MTU:
1218	default:
1219		break;
1220	}
1221}
1222
1223int
1224sfb_suspendq(struct sfb *sp, class_queue_t *q, boolean_t on)
1225{
1226#pragma unused(q)
1227	struct ifnet *ifp = sp->sfb_ifp;
1228
1229	VERIFY(ifp != NULL);
1230
1231	if ((on && (sp->sfb_flags & SFBF_SUSPENDED)) ||
1232	    (!on && !(sp->sfb_flags & SFBF_SUSPENDED)))
1233		return (0);
1234
1235	if (!(sp->sfb_flags & SFBF_FLOWCTL)) {
1236		log(LOG_ERR, "%s: SFB qid=%d, unable to %s queue since "
1237		    "flow-control is not enabled", if_name(ifp), sp->sfb_qid,
1238		    (on ? "suspend" : "resume"));
1239		return (ENOTSUP);
1240	}
1241
1242	if (classq_verbose) {
1243		log(LOG_DEBUG, "%s: SFB qid=%d, setting state to %s",
1244		    if_name(ifp), sp->sfb_qid, (on ? "SUSPENDED" : "RUNNING"));
1245	}
1246
1247	if (on) {
1248		sp->sfb_flags |= SFBF_SUSPENDED;
1249	} else {
1250		sp->sfb_flags &= ~SFBF_SUSPENDED;
1251		sfb_swap_bins(sp, qlen(q));
1252	}
1253
1254	return (0);
1255}
1256