1/* 2 * Copyright (c) 2011-2013 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29#include <sys/cdefs.h> 30#include <sys/param.h> 31#include <sys/mbuf.h> 32#include <sys/socket.h> 33#include <sys/sockio.h> 34#include <sys/systm.h> 35#include <sys/sysctl.h> 36#include <sys/syslog.h> 37#include <sys/proc.h> 38#include <sys/errno.h> 39#include <sys/kernel.h> 40#include <sys/kauth.h> 41 42#include <kern/zalloc.h> 43 44#include <net/if.h> 45#include <net/if_var.h> 46#include <net/if_types.h> 47#include <net/dlil.h> 48#include <net/flowadv.h> 49 50#include <netinet/in.h> 51#include <netinet/in_systm.h> 52#include <netinet/ip.h> 53#if INET6 54#include <netinet/ip6.h> 55#endif 56 57#include <net/classq/classq_sfb.h> 58#include <net/flowhash.h> 59#include <net/net_osdep.h> 60#include <dev/random/randomdev.h> 61 62/* 63 * Stochastic Fair Blue 64 * 65 * Wu-chang Feng, Dilip D. Kandlur, Debanjan Saha, Kang G. Shin 66 * http://www.thefengs.com/wuchang/blue/CSE-TR-387-99.pdf 67 * 68 * Based on the NS code with the following parameters: 69 * 70 * bytes: false 71 * decrement: 0.001 72 * increment: 0.005 73 * hold-time: 10ms-50ms (randomized) 74 * algorithm: 0 75 * pbox: 1 76 * pbox-time: 50-100ms (randomized) 77 * hinterval: 11-23 (randomized) 78 * 79 * This implementation uses L = 2 and N = 32 for 2 sets of: 80 * 81 * B[L][N]: L x N array of bins (L levels, N bins per level) 82 * 83 * Each set effectively creates 32^2 virtual buckets (bin combinations) 84 * while using only O(32*2) states. 85 * 86 * Given a 32-bit hash value, we divide it such that octets [0,1,2,3] are 87 * used as index for the bins across the 2 levels, where level 1 uses [0,2] 88 * and level 2 uses [1,3]. The 2 values per level correspond to the indices 89 * for the current and warm-up sets (section 4.4. in the SFB paper regarding 90 * Moving Hash Functions explains the purposes of these 2 sets.) 91 */ 92 93/* 94 * Use Murmur3A_x86_32 for hash function. It seems to perform consistently 95 * across platforms for 1-word key (32-bit flowhash value). See flowhash.h 96 * for other alternatives. We only need 16-bit hash output. 97 */ 98#define SFB_HASH net_flowhash_mh3_x86_32 99#define SFB_HASHMASK HASHMASK(16) 100 101#define SFB_BINMASK(_x) \ 102 ((_x) & HASHMASK(SFB_BINS_SHIFT)) 103 104#define SFB_BINST(_sp, _l, _n, _c) \ 105 (&(*(_sp)->sfb_bins)[_c].stats[_l][_n]) 106 107#define SFB_BINFT(_sp, _l, _n, _c) \ 108 (&(*(_sp)->sfb_bins)[_c].freezetime[_l][_n]) 109 110#define SFB_FC_LIST(_sp, _n) \ 111 (&(*(_sp)->sfb_fc_lists)[_n]) 112 113/* 114 * The holdtime parameter determines the minimum time interval between 115 * two successive updates of the marking probability. In the event the 116 * uplink speed is not known, a default value is chosen and is randomized 117 * to be within the following range. 118 */ 119#define HOLDTIME_BASE (100ULL * 1000 * 1000) /* 100ms */ 120#define HOLDTIME_MIN (10ULL * 1000 * 1000) /* 10ms */ 121#define HOLDTIME_MAX (100ULL * 1000 * 1000) /* 100ms */ 122 123/* 124 * The pboxtime parameter determines the bandwidth allocated for rogue 125 * flows, i.e. the rate limiting bandwidth. In the event the uplink speed 126 * is not known, a default value is chosen and is randomized to be within 127 * the following range. 128 */ 129#define PBOXTIME_BASE (300ULL * 1000 * 1000) /* 300ms */ 130#define PBOXTIME_MIN (30ULL * 1000 * 1000) /* 30ms */ 131#define PBOXTIME_MAX (300ULL * 1000 * 1000) /* 300ms */ 132 133#define SFB_RANDOM(sp, tmin, tmax) ((sfb_random(sp) % (tmax)) + (tmin)) 134 135#define SFB_PKT_PBOX 0x1 /* in penalty box */ 136 137/* The following mantissa values are in SFB_FP_SHIFT Q format */ 138#define SFB_MAX_PMARK (1 << SFB_FP_SHIFT) /* Q14 representation of 1.00 */ 139 140/* 141 * These are d1 (increment) and d2 (decrement) parameters, used to determine 142 * the amount by which the marking probability is incremented when the queue 143 * overflows, or is decremented when the link is idle. d1 is set higher than 144 * d2, because link underutilization can occur when congestion management is 145 * either too conservative or too aggressive, but packet loss occurs only 146 * when congestion management is too conservative. By weighing heavily 147 * against packet loss, it can quickly reach to a substantial increase in 148 * traffic load. 149 */ 150#define SFB_INCREMENT 82 /* Q14 representation of 0.005 */ 151#define SFB_DECREMENT 16 /* Q14 representation of 0.001 */ 152 153#define SFB_PMARK_TH 16056 /* Q14 representation of 0.98 */ 154#define SFB_PMARK_WARM 3276 /* Q14 representation of 0.2 */ 155 156#define SFB_PMARK_INC(_bin) do { \ 157 (_bin)->pmark += sfb_increment; \ 158 if ((_bin)->pmark > SFB_MAX_PMARK) \ 159 (_bin)->pmark = SFB_MAX_PMARK; \ 160} while (0) 161 162#define SFB_PMARK_DEC(_bin) do { \ 163 if ((_bin)->pmark > 0) { \ 164 (_bin)->pmark -= sfb_decrement; \ 165 if ((_bin)->pmark < 0) \ 166 (_bin)->pmark = 0; \ 167 } \ 168} while (0) 169 170#define HINTERVAL_MIN (10) /* 10 seconds */ 171#define HINTERVAL_MAX (20) /* 20 seconds */ 172#define SFB_HINTERVAL(sp) ((sfb_random(sp) % HINTERVAL_MAX) + HINTERVAL_MIN) 173 174#define DEQUEUE_DECAY 7 /* ilog2 of EWMA decay rate, (128) */ 175#define DEQUEUE_SPIKE(_new, _old) \ 176 ((u_int64_t)ABS((int64_t)(_new) - (int64_t)(_old)) > ((_old) << 11)) 177 178#define ABS(v) (((v) > 0) ? (v) : -(v)) 179 180#define SFB_ZONE_MAX 32 /* maximum elements in zone */ 181#define SFB_ZONE_NAME "classq_sfb" /* zone name */ 182 183#define SFB_BINS_ZONE_MAX 32 /* maximum elements in zone */ 184#define SFB_BINS_ZONE_NAME "classq_sfb_bins" /* zone name */ 185 186#define SFB_FCL_ZONE_MAX 32 /* maximum elements in zone */ 187#define SFB_FCL_ZONE_NAME "classq_sfb_fcl" /* zone name */ 188 189/* Place the flow control entries in current bin on level 0 */ 190#define SFB_FC_LEVEL 0 191 192/* Store SFB hash and flags in the module private scratch space */ 193#define pkt_sfb_hash8 pkt_mpriv.__mpriv_u.__mpriv32[0].__mpriv32_u.__val8 194#define pkt_sfb_hash16 pkt_mpriv.__mpriv_u.__mpriv32[0].__mpriv32_u.__val16 195#define pkt_sfb_hash32 pkt_mpriv.__mpriv_u.__mpriv32[0].__mpriv32_u.__val32 196#define pkt_sfb_flags pkt_mpriv.__mpriv_u.__mpriv32[1].__mpriv32_u.__val32 197 198static unsigned int sfb_size; /* size of zone element */ 199static struct zone *sfb_zone; /* zone for sfb */ 200 201static unsigned int sfb_bins_size; /* size of zone element */ 202static struct zone *sfb_bins_zone; /* zone for sfb_bins */ 203 204static unsigned int sfb_fcl_size; /* size of zone element */ 205static struct zone *sfb_fcl_zone; /* zone for sfb_fc_lists */ 206 207/* internal function prototypes */ 208static u_int32_t sfb_random(struct sfb *); 209static struct mbuf *sfb_getq_flow(struct sfb *, class_queue_t *, u_int32_t, 210 boolean_t); 211static void sfb_resetq(struct sfb *, cqev_t); 212static void sfb_calc_holdtime(struct sfb *, u_int64_t); 213static void sfb_calc_pboxtime(struct sfb *, u_int64_t); 214static void sfb_calc_hinterval(struct sfb *, u_int64_t *); 215static void sfb_swap_bins(struct sfb *, u_int32_t); 216static inline int sfb_pcheck(struct sfb *, struct pkthdr *); 217static int sfb_penalize(struct sfb *, struct pkthdr *, struct timespec *); 218static void sfb_adjust_bin(struct sfb *, struct sfbbinstats *, 219 struct timespec *, struct timespec *, boolean_t); 220static void sfb_decrement_bin(struct sfb *, struct sfbbinstats *, 221 struct timespec *, struct timespec *); 222static void sfb_increment_bin(struct sfb *, struct sfbbinstats *, 223 struct timespec *, struct timespec *); 224static inline void sfb_dq_update_bins(struct sfb *, struct pkthdr *, 225 struct timespec *); 226static inline void sfb_eq_update_bins(struct sfb *, struct pkthdr *); 227static int sfb_drop_early(struct sfb *, struct pkthdr *, u_int16_t *, 228 struct timespec *); 229static boolean_t sfb_bin_addfcentry(struct sfb *, struct pkthdr *); 230static void sfb_fclist_append(struct sfb *, struct sfb_fcl *); 231static void sfb_fclists_clean(struct sfb *sp); 232 233SYSCTL_NODE(_net_classq, OID_AUTO, sfb, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "SFB"); 234 235static u_int64_t sfb_holdtime = 0; /* 0 indicates "automatic" */ 236SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, holdtime, CTLFLAG_RW|CTLFLAG_LOCKED, 237 &sfb_holdtime, "SFB freeze time in nanoseconds"); 238 239static u_int64_t sfb_pboxtime = 0; /* 0 indicates "automatic" */ 240SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, pboxtime, CTLFLAG_RW|CTLFLAG_LOCKED, 241 &sfb_pboxtime, "SFB penalty box time in nanoseconds"); 242 243static u_int64_t sfb_hinterval; 244SYSCTL_QUAD(_net_classq_sfb, OID_AUTO, hinterval, CTLFLAG_RW|CTLFLAG_LOCKED, 245 &sfb_hinterval, "SFB hash interval in nanoseconds"); 246 247static u_int32_t sfb_increment = SFB_INCREMENT; 248SYSCTL_UINT(_net_classq_sfb, OID_AUTO, increment, CTLFLAG_RW|CTLFLAG_LOCKED, 249 &sfb_increment, SFB_INCREMENT, "SFB increment [d1]"); 250 251static u_int32_t sfb_decrement = SFB_DECREMENT; 252SYSCTL_UINT(_net_classq_sfb, OID_AUTO, decrement, CTLFLAG_RW|CTLFLAG_LOCKED, 253 &sfb_decrement, SFB_DECREMENT, "SFB decrement [d2]"); 254 255static u_int32_t sfb_allocation = 0; /* 0 means "automatic" */ 256SYSCTL_UINT(_net_classq_sfb, OID_AUTO, allocation, CTLFLAG_RW|CTLFLAG_LOCKED, 257 &sfb_allocation, 0, "SFB bin allocation"); 258 259static u_int32_t sfb_ratelimit = 0; 260SYSCTL_UINT(_net_classq_sfb, OID_AUTO, ratelimit, CTLFLAG_RW|CTLFLAG_LOCKED, 261 &sfb_ratelimit, 0, "SFB rate limit"); 262 263#define MBPS (1ULL * 1000 * 1000) 264#define GBPS (MBPS * 1000) 265 266struct sfb_time_tbl { 267 u_int64_t speed; /* uplink speed */ 268 u_int64_t holdtime; /* hold time */ 269 u_int64_t pboxtime; /* penalty box time */ 270}; 271 272static struct sfb_time_tbl sfb_ttbl[] = { 273 { 1 * MBPS, HOLDTIME_BASE * 1000, PBOXTIME_BASE * 1000 }, 274 { 10 * MBPS, HOLDTIME_BASE * 100, PBOXTIME_BASE * 100 }, 275 { 100 * MBPS, HOLDTIME_BASE * 10, PBOXTIME_BASE * 10 }, 276 { 1 * GBPS, HOLDTIME_BASE, PBOXTIME_BASE }, 277 { 10 * GBPS, HOLDTIME_BASE / 10, PBOXTIME_BASE / 10 }, 278 { 100 * GBPS, HOLDTIME_BASE / 100, PBOXTIME_BASE / 100 }, 279 { 0, 0, 0 } 280}; 281 282void 283sfb_init(void) 284{ 285 _CASSERT(SFBF_ECN4 == CLASSQF_ECN4); 286 _CASSERT(SFBF_ECN6 == CLASSQF_ECN6); 287 288 sfb_size = sizeof (struct sfb); 289 sfb_zone = zinit(sfb_size, SFB_ZONE_MAX * sfb_size, 290 0, SFB_ZONE_NAME); 291 if (sfb_zone == NULL) { 292 panic("%s: failed allocating %s", __func__, SFB_ZONE_NAME); 293 /* NOTREACHED */ 294 } 295 zone_change(sfb_zone, Z_EXPAND, TRUE); 296 zone_change(sfb_zone, Z_CALLERACCT, TRUE); 297 298 sfb_bins_size = sizeof (*((struct sfb *)0)->sfb_bins); 299 sfb_bins_zone = zinit(sfb_bins_size, SFB_BINS_ZONE_MAX * sfb_bins_size, 300 0, SFB_BINS_ZONE_NAME); 301 if (sfb_bins_zone == NULL) { 302 panic("%s: failed allocating %s", __func__, SFB_BINS_ZONE_NAME); 303 /* NOTREACHED */ 304 } 305 zone_change(sfb_bins_zone, Z_EXPAND, TRUE); 306 zone_change(sfb_bins_zone, Z_CALLERACCT, TRUE); 307 308 sfb_fcl_size = sizeof (*((struct sfb *)0)->sfb_fc_lists); 309 sfb_fcl_zone = zinit(sfb_fcl_size, SFB_FCL_ZONE_MAX * sfb_fcl_size, 310 0, SFB_FCL_ZONE_NAME); 311 if (sfb_fcl_zone == NULL) { 312 panic("%s: failed allocating %s", __func__, SFB_FCL_ZONE_NAME); 313 /* NOTREACHED */ 314 } 315 zone_change(sfb_fcl_zone, Z_EXPAND, TRUE); 316 zone_change(sfb_fcl_zone, Z_CALLERACCT, TRUE); 317} 318 319static u_int32_t 320sfb_random(struct sfb *sp) 321{ 322 IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd); 323 return (RandomULong()); 324} 325 326static void 327sfb_calc_holdtime(struct sfb *sp, u_int64_t outbw) 328{ 329 u_int64_t holdtime; 330 331 if (sfb_holdtime != 0) { 332 holdtime = sfb_holdtime; 333 } else if (outbw == 0) { 334 holdtime = SFB_RANDOM(sp, HOLDTIME_MIN, HOLDTIME_MAX); 335 } else { 336 unsigned int n, i; 337 338 n = sfb_ttbl[0].holdtime; 339 for (i = 0; sfb_ttbl[i].speed != 0; i++) { 340 if (outbw < sfb_ttbl[i].speed) 341 break; 342 n = sfb_ttbl[i].holdtime; 343 } 344 holdtime = n; 345 } 346 net_nsectimer(&holdtime, &sp->sfb_holdtime); 347} 348 349static void 350sfb_calc_pboxtime(struct sfb *sp, u_int64_t outbw) 351{ 352 u_int64_t pboxtime; 353 354 if (sfb_pboxtime != 0) { 355 pboxtime = sfb_pboxtime; 356 } else if (outbw == 0) { 357 pboxtime = SFB_RANDOM(sp, PBOXTIME_MIN, PBOXTIME_MAX); 358 } else { 359 unsigned int n, i; 360 361 n = sfb_ttbl[0].pboxtime; 362 for (i = 0; sfb_ttbl[i].speed != 0; i++) { 363 if (outbw < sfb_ttbl[i].speed) 364 break; 365 n = sfb_ttbl[i].pboxtime; 366 } 367 pboxtime = n; 368 } 369 net_nsectimer(&pboxtime, &sp->sfb_pboxtime); 370 net_timerclear(&sp->sfb_pboxfreeze); 371} 372 373static void 374sfb_calc_hinterval(struct sfb *sp, u_int64_t *t) 375{ 376 u_int64_t hinterval; 377 struct timespec now; 378 379 if (t != NULL) { 380 /* 381 * TODO adi@apple.com: use dq_avg to derive hinterval. 382 */ 383 hinterval = *t; 384 } 385 386 if (sfb_hinterval != 0) 387 hinterval = sfb_hinterval; 388 else if (t == NULL || hinterval == 0) 389 hinterval = ((u_int64_t)SFB_HINTERVAL(sp) * NSEC_PER_SEC); 390 391 net_nsectimer(&hinterval, &sp->sfb_hinterval); 392 393 nanouptime(&now); 394 net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset); 395} 396 397/* 398 * sfb support routines 399 */ 400struct sfb * 401sfb_alloc(struct ifnet *ifp, u_int32_t qid, u_int32_t qlim, u_int32_t flags) 402{ 403 struct sfb *sp; 404 int i; 405 406 VERIFY(ifp != NULL && qlim > 0); 407 408 sp = zalloc(sfb_zone); 409 if (sp == NULL) { 410 log(LOG_ERR, "%s: SFB unable to allocate\n", if_name(ifp)); 411 return (NULL); 412 } 413 bzero(sp, sfb_size); 414 415 if ((sp->sfb_bins = zalloc(sfb_bins_zone)) == NULL) { 416 log(LOG_ERR, "%s: SFB unable to allocate bins\n", if_name(ifp)); 417 sfb_destroy(sp); 418 return (NULL); 419 } 420 bzero(sp->sfb_bins, sfb_bins_size); 421 422 if ((sp->sfb_fc_lists = zalloc(sfb_fcl_zone)) == NULL) { 423 log(LOG_ERR, "%s: SFB unable to allocate flow control lists\n", 424 if_name(ifp)); 425 sfb_destroy(sp); 426 return(NULL); 427 } 428 bzero(sp->sfb_fc_lists, sfb_fcl_size); 429 430 for (i = 0; i < SFB_BINS; ++i) 431 STAILQ_INIT(&SFB_FC_LIST(sp, i)->fclist); 432 433 sp->sfb_ifp = ifp; 434 sp->sfb_qlim = qlim; 435 sp->sfb_qid = qid; 436 sp->sfb_flags = (flags & SFBF_USERFLAGS); 437#if !PF_ECN 438 if (sp->sfb_flags & SFBF_ECN) { 439 sp->sfb_flags &= ~SFBF_ECN; 440 log(LOG_ERR, "%s: SFB qid=%d, ECN not available; ignoring " 441 "SFBF_ECN flag!\n", if_name(ifp), sp->sfb_qid); 442 } 443#endif /* !PF_ECN */ 444 445 sfb_resetq(sp, -1); 446 447 return (sp); 448} 449 450static void 451sfb_fclist_append(struct sfb *sp, struct sfb_fcl *fcl) 452{ 453 IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd); 454 455 VERIFY(STAILQ_EMPTY(&fcl->fclist) || fcl->cnt > 0); 456 sp->sfb_stats.flow_feedback += fcl->cnt; 457 fcl->cnt = 0; 458 459 flowadv_add(&fcl->fclist); 460 VERIFY(fcl->cnt == 0 && STAILQ_EMPTY(&fcl->fclist)); 461} 462 463static void 464sfb_fclists_clean(struct sfb *sp) 465{ 466 int i; 467 468 /* Move all the flow control entries to the flowadv list */ 469 for (i = 0; i < SFB_BINS; ++i) { 470 struct sfb_fcl *fcl = SFB_FC_LIST(sp, i); 471 if (!STAILQ_EMPTY(&fcl->fclist)) 472 sfb_fclist_append(sp, fcl); 473 } 474} 475 476void 477sfb_destroy(struct sfb *sp) 478{ 479 sfb_fclists_clean(sp); 480 if (sp->sfb_bins != NULL) { 481 zfree(sfb_bins_zone, sp->sfb_bins); 482 sp->sfb_bins = NULL; 483 } 484 if (sp->sfb_fc_lists != NULL) { 485 zfree(sfb_fcl_zone, sp->sfb_fc_lists); 486 sp->sfb_fc_lists = NULL; 487 } 488 zfree(sfb_zone, sp); 489} 490 491static void 492sfb_resetq(struct sfb *sp, cqev_t ev) 493{ 494 struct ifnet *ifp = sp->sfb_ifp; 495 u_int64_t eff_rate; 496 497 VERIFY(ifp != NULL); 498 499 if (ev != CLASSQ_EV_LINK_DOWN) { 500 (*sp->sfb_bins)[0].fudge = sfb_random(sp); 501 (*sp->sfb_bins)[1].fudge = sfb_random(sp); 502 sp->sfb_allocation = ((sfb_allocation == 0) ? 503 (sp->sfb_qlim / 3) : sfb_allocation); 504 sp->sfb_drop_thresh = sp->sfb_allocation + 505 (sp->sfb_allocation >> 1); 506 } 507 508 sp->sfb_clearpkts = 0; 509 sp->sfb_current = 0; 510 511 eff_rate = ifnet_output_linkrate(ifp); 512 sp->sfb_eff_rate = eff_rate; 513 514 sfb_calc_holdtime(sp, eff_rate); 515 sfb_calc_pboxtime(sp, eff_rate); 516 sfb_calc_hinterval(sp, NULL); 517 518 if (ev == CLASSQ_EV_LINK_DOWN || 519 ev == CLASSQ_EV_LINK_UP) 520 sfb_fclists_clean(sp); 521 522 bzero(sp->sfb_bins, sizeof (*sp->sfb_bins)); 523 bzero(&sp->sfb_stats, sizeof (sp->sfb_stats)); 524 525 if (ev == CLASSQ_EV_LINK_DOWN || !classq_verbose) 526 return; 527 528 log(LOG_DEBUG, "%s: SFB qid=%d, holdtime=%llu nsec, " 529 "pboxtime=%llu nsec, allocation=%d, drop_thresh=%d, " 530 "hinterval=%d sec, sfb_bins=%d bytes, eff_rate=%llu bps\n", 531 if_name(ifp), sp->sfb_qid, (u_int64_t)sp->sfb_holdtime.tv_nsec, 532 (u_int64_t)sp->sfb_pboxtime.tv_nsec, 533 (u_int32_t)sp->sfb_allocation, (u_int32_t)sp->sfb_drop_thresh, 534 (int)sp->sfb_hinterval.tv_sec, (int)sizeof (*sp->sfb_bins), 535 eff_rate); 536} 537 538void 539sfb_getstats(struct sfb *sp, struct sfb_stats *sps) 540{ 541 sps->allocation = sp->sfb_allocation; 542 sps->dropthresh = sp->sfb_drop_thresh; 543 sps->clearpkts = sp->sfb_clearpkts; 544 sps->current = sp->sfb_current; 545 546 net_timernsec(&sp->sfb_holdtime, &sp->sfb_stats.hold_time); 547 net_timernsec(&sp->sfb_pboxtime, &sp->sfb_stats.pbox_time); 548 net_timernsec(&sp->sfb_hinterval, &sp->sfb_stats.rehash_intval); 549 *(&(sps->sfbstats)) = *(&(sp->sfb_stats)); 550 551 _CASSERT(sizeof ((*sp->sfb_bins)[0].stats) == 552 sizeof (sps->binstats[0].stats)); 553 554 bcopy(&(*sp->sfb_bins)[0].stats, &sps->binstats[0].stats, 555 sizeof (sps->binstats[0].stats)); 556 bcopy(&(*sp->sfb_bins)[1].stats, &sps->binstats[1].stats, 557 sizeof (sps->binstats[1].stats)); 558} 559 560static void 561sfb_swap_bins(struct sfb *sp, u_int32_t len) 562{ 563 int i, j, s; 564 565 if (sp->sfb_flags & SFBF_SUSPENDED) 566 return; 567 568 s = sp->sfb_current; 569 VERIFY((s + (s ^ 1)) == 1); 570 571 (*sp->sfb_bins)[s].fudge = sfb_random(sp); /* recompute perturbation */ 572 sp->sfb_clearpkts = len; 573 sp->sfb_stats.num_rehash++; 574 575 s = (sp->sfb_current ^= 1); /* flip the bit (swap current) */ 576 577 if (classq_verbose) { 578 log(LOG_DEBUG, "%s: SFB qid=%d, set %d is now current, " 579 "qlen=%d\n", if_name(sp->sfb_ifp), sp->sfb_qid, s, len); 580 } 581 582 /* clear freezetime for all current bins */ 583 bzero(&(*sp->sfb_bins)[s].freezetime, 584 sizeof ((*sp->sfb_bins)[s].freezetime)); 585 586 /* clear/adjust bin statistics and flow control lists */ 587 for (i = 0; i < SFB_BINS; i++) { 588 struct sfb_fcl *fcl = SFB_FC_LIST(sp, i); 589 590 if (!STAILQ_EMPTY(&fcl->fclist)) 591 sfb_fclist_append(sp, fcl); 592 593 for (j = 0; j < SFB_LEVELS; j++) { 594 struct sfbbinstats *cbin, *wbin; 595 596 cbin = SFB_BINST(sp, j, i, s); /* current */ 597 wbin = SFB_BINST(sp, j, i, s ^ 1); /* warm-up */ 598 599 cbin->pkts = 0; 600 if (cbin->pmark > SFB_MAX_PMARK) 601 cbin->pmark = SFB_MAX_PMARK; 602 if (cbin->pmark < 0) 603 cbin->pmark = 0; 604 605 /* 606 * Keep pmark from before to identify 607 * non-responsives immediately. 608 */ 609 if (wbin->pmark > SFB_PMARK_WARM) 610 wbin->pmark = SFB_PMARK_WARM; 611 } 612 } 613} 614 615static inline int 616sfb_pcheck(struct sfb *sp, struct pkthdr *pkt) 617{ 618#if SFB_LEVELS != 2 619 int i, n; 620#endif /* SFB_LEVELS != 2 */ 621 int s; 622 623 s = sp->sfb_current; 624 VERIFY((s + (s ^ 1)) == 1); 625 626 /* 627 * For current bins, returns 1 if all pmark >= SFB_PMARK_TH, 628 * 0 otherwise; optimize for SFB_LEVELS=2. 629 */ 630#if SFB_LEVELS == 2 631 /* 632 * Level 0: bin index at [0] for set 0; [2] for set 1 633 * Level 1: bin index at [1] for set 0; [3] for set 1 634 */ 635 if (SFB_BINST(sp, 0, SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]), 636 s)->pmark < SFB_PMARK_TH || 637 SFB_BINST(sp, 1, SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1) + 1]), 638 s)->pmark < SFB_PMARK_TH) 639 return (0); 640#else /* SFB_LEVELS != 2 */ 641 for (i = 0; i < SFB_LEVELS; i++) { 642 if (s == 0) /* set 0, bin index [0,1] */ 643 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]); 644 else /* set 1, bin index [2,3] */ 645 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]); 646 647 if (SFB_BINST(sp, i, n, s)->pmark < SFB_PMARK_TH) 648 return (0); 649 } 650#endif /* SFB_LEVELS != 2 */ 651 return (1); 652} 653 654static int 655sfb_penalize(struct sfb *sp, struct pkthdr *pkt, struct timespec *now) 656{ 657 struct timespec delta = { 0, 0 }; 658 659 /* If minimum pmark of current bins is < SFB_PMARK_TH, we're done */ 660 if (!sfb_ratelimit || !sfb_pcheck(sp, pkt)) 661 return (0); 662 663 net_timersub(now, &sp->sfb_pboxfreeze, &delta); 664 if (net_timercmp(&delta, &sp->sfb_pboxtime, <)) { 665#if SFB_LEVELS != 2 666 int i; 667#endif /* SFB_LEVELS != 2 */ 668 struct sfbbinstats *bin; 669 int n, w; 670 671 w = sp->sfb_current ^ 1; 672 VERIFY((w + (w ^ 1)) == 1); 673 674 /* 675 * Update warm-up bins; optimize for SFB_LEVELS=2 676 */ 677#if SFB_LEVELS == 2 678 /* Level 0: bin index at [0] for set 0; [2] for set 1 */ 679 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(w << 1)]); 680 bin = SFB_BINST(sp, 0, n, w); 681 if (bin->pkts >= sp->sfb_allocation) 682 sfb_increment_bin(sp, bin, SFB_BINFT(sp, 0, n, w), now); 683 684 /* Level 0: bin index at [1] for set 0; [3] for set 1 */ 685 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(w << 1) + 1]); 686 bin = SFB_BINST(sp, 1, n, w); 687 if (bin->pkts >= sp->sfb_allocation) 688 sfb_increment_bin(sp, bin, SFB_BINFT(sp, 1, n, w), now); 689#else /* SFB_LEVELS != 2 */ 690 for (i = 0; i < SFB_LEVELS; i++) { 691 if (w == 0) /* set 0, bin index [0,1] */ 692 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]); 693 else /* set 1, bin index [2,3] */ 694 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]); 695 696 bin = SFB_BINST(sp, i, n, w); 697 if (bin->pkts >= sp->sfb_allocation) { 698 sfb_increment_bin(sp, bin, 699 SFB_BINFT(sp, i, n, w), now); 700 } 701 } 702#endif /* SFB_LEVELS != 2 */ 703 return (1); 704 } 705 706 /* non-conformant or else misclassified flow; queue it anyway */ 707 pkt->pkt_sfb_flags |= SFB_PKT_PBOX; 708 *(&sp->sfb_pboxfreeze) = *now; 709 710 return (0); 711} 712 713static void 714sfb_adjust_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft, 715 struct timespec *now, boolean_t inc) 716{ 717 struct timespec delta; 718 719 net_timersub(now, ft, &delta); 720 if (net_timercmp(&delta, &sp->sfb_holdtime, <)) { 721 if (classq_verbose > 1) { 722 log(LOG_DEBUG, "%s: SFB qid=%d, %s update frozen " 723 "(delta=%llu nsec)\n", if_name(sp->sfb_ifp), 724 sp->sfb_qid, inc ? "increment" : "decrement", 725 (u_int64_t)delta.tv_nsec); 726 } 727 return; 728 } 729 730 /* increment/decrement marking probability */ 731 *ft = *now; 732 if (inc) 733 SFB_PMARK_INC(bin); 734 else 735 SFB_PMARK_DEC(bin); 736} 737 738static void 739sfb_decrement_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft, 740 struct timespec *now) 741{ 742 return (sfb_adjust_bin(sp, bin, ft, now, FALSE)); 743} 744 745static void 746sfb_increment_bin(struct sfb *sp, struct sfbbinstats *bin, struct timespec *ft, 747 struct timespec *now) 748{ 749 return (sfb_adjust_bin(sp, bin, ft, now, TRUE)); 750} 751 752static inline void 753sfb_dq_update_bins(struct sfb *sp, struct pkthdr *pkt, struct timespec *now) 754{ 755#if SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 756 int i; 757#endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */ 758 struct sfbbinstats *bin; 759 int s, n; 760 struct sfb_fcl *fcl = NULL; 761 762 s = sp->sfb_current; 763 VERIFY((s + (s ^ 1)) == 1); 764 765 /* 766 * Update current bins; optimize for SFB_LEVELS=2 and SFB_FC_LEVEL=0 767 */ 768#if SFB_LEVELS == 2 && SFB_FC_LEVEL == 0 769 /* Level 0: bin index at [0] for set 0; [2] for set 1 */ 770 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]); 771 bin = SFB_BINST(sp, 0, n, s); 772 773 VERIFY(bin->pkts > 0); 774 if (--bin->pkts == 0) { 775 sfb_decrement_bin(sp, bin, SFB_BINFT(sp, 0, n, s), now); 776 } 777 if (bin->pkts <= (sp->sfb_allocation >> 2)) { 778 /* deliver flow control feedback to the sockets */ 779 fcl = SFB_FC_LIST(sp, n); 780 if (!STAILQ_EMPTY(&fcl->fclist)) 781 sfb_fclist_append(sp, fcl); 782 } 783 784 /* Level 1: bin index at [1] for set 0; [3] for set 1 */ 785 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1) + 1]); 786 bin = SFB_BINST(sp, 1, n, s); 787 788 VERIFY(bin->pkts > 0); 789 if (--bin->pkts == 0) 790 sfb_decrement_bin(sp, bin, SFB_BINFT(sp, 1, n, s), now); 791#else /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */ 792 for (i = 0; i < SFB_LEVELS; i++) { 793 if (s == 0) /* set 0, bin index [0,1] */ 794 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]); 795 else /* set 1, bin index [2,3] */ 796 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]); 797 798 bin = SFB_BINST(sp, i, n, s); 799 800 VERIFY(bin->pkts > 0); 801 if (--bin->pkts == 0) { 802 sfb_decrement_bin(sp, bin, 803 SFB_BINFT(sp, i, n, s), now); 804 } 805 if (bin->pkts <= (sp->sfb_allocation >> 2)) { 806 /* deliver flow control feedback to the sockets */ 807 if (i == SFB_FC_LEVEL) { 808 fcl = SFB_FC_LIST(sp, n); 809 if (!STAILQ_EMPTY(&fcl->fclist)) 810 sfb_fclist_append(sp, fcl); 811 } 812 } 813 } 814#endif /* SFB_LEVELS != 2 || SFB_FC_LEVEL != 0 */ 815} 816 817static inline void 818sfb_eq_update_bins(struct sfb *sp, struct pkthdr *pkt) 819{ 820#if SFB_LEVELS != 2 821 int i, n; 822#endif /* SFB_LEVELS != 2 */ 823 int s; 824 825 s = sp->sfb_current; 826 VERIFY((s + (s ^ 1)) == 1); 827 828 /* 829 * Update current bins; optimize for SFB_LEVELS=2 830 */ 831#if SFB_LEVELS == 2 832 /* Level 0: bin index at [0] for set 0; [2] for set 1 */ 833 SFB_BINST(sp, 0, 834 SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]), s)->pkts++; 835 836 /* Level 1: bin index at [1] for set 0; [3] for set 1 */ 837 SFB_BINST(sp, 1, 838 SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1) + 1]), s)->pkts++; 839#else /* SFB_LEVELS != 2 */ 840 for (i = 0; i < SFB_LEVELS; i++) { 841 if (s == 0) /* set 0, bin index [0,1] */ 842 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]); 843 else /* set 1, bin index [2,3] */ 844 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]); 845 846 SFB_BINST(sp, i, n, s)->pkts++; 847 } 848#endif /* SFB_LEVELS != 2 */ 849} 850 851static boolean_t 852sfb_bin_addfcentry(struct sfb *sp, struct pkthdr *pkt) 853{ 854 struct flowadv_fcentry *fce; 855 u_int32_t flowsrc, flowid; 856 struct sfb_fcl *fcl; 857 int s; 858 859 s = sp->sfb_current; 860 VERIFY((s + (s ^ 1)) == 1); 861 862 flowsrc = pkt->pkt_flowsrc; 863 flowid = pkt->pkt_flowid; 864 865 if (flowid == 0) { 866 sp->sfb_stats.null_flowid++; 867 return (FALSE); 868 } 869 870 /* 871 * Use value at index 0 for set 0 and 872 * value at index 2 for set 1 873 */ 874 fcl = SFB_FC_LIST(sp, SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)])); 875 STAILQ_FOREACH(fce, &fcl->fclist, fce_link) { 876 if (fce->fce_flowsrc == flowsrc && 877 fce->fce_flowid == flowid) { 878 /* Already on flow control list; just return */ 879 return (TRUE); 880 } 881 } 882 883 IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd); 884 fce = flowadv_alloc_entry(M_WAITOK); 885 if (fce != NULL) { 886 fce->fce_flowsrc = flowsrc; 887 fce->fce_flowid = flowid; 888 STAILQ_INSERT_TAIL(&fcl->fclist, fce, fce_link); 889 fcl->cnt++; 890 sp->sfb_stats.flow_controlled++; 891 } 892 893 return (fce != NULL); 894} 895 896/* 897 * early-drop probability is kept in pmark of each bin of the flow 898 */ 899static int 900sfb_drop_early(struct sfb *sp, struct pkthdr *pkt, u_int16_t *pmin, 901 struct timespec *now) 902{ 903#if SFB_LEVELS != 2 904 int i; 905#endif /* SFB_LEVELS != 2 */ 906 struct sfbbinstats *bin; 907 int s, n, ret = 0; 908 909 s = sp->sfb_current; 910 VERIFY((s + (s ^ 1)) == 1); 911 912 *pmin = (u_int16_t)-1; 913 914 /* 915 * Update current bins; optimize for SFB_LEVELS=2 916 */ 917#if SFB_LEVELS == 2 918 /* Level 0: bin index at [0] for set 0; [2] for set 1 */ 919 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1)]); 920 bin = SFB_BINST(sp, 0, n, s); 921 if (*pmin > (u_int16_t)bin->pmark) 922 *pmin = (u_int16_t)bin->pmark; 923 924 if (bin->pkts >= sp->sfb_allocation) { 925 if (bin->pkts >= sp->sfb_drop_thresh) 926 ret = 1; /* drop or mark */ 927 sfb_increment_bin(sp, bin, SFB_BINFT(sp, 0, n, s), now); 928 } 929 930 /* Level 1: bin index at [1] for set 0; [3] for set 1 */ 931 n = SFB_BINMASK(pkt->pkt_sfb_hash8[(s << 1) + 1]); 932 bin = SFB_BINST(sp, 1, n, s); 933 if (*pmin > (u_int16_t)bin->pmark) 934 *pmin = (u_int16_t)bin->pmark; 935 936 if (bin->pkts >= sp->sfb_allocation) { 937 if (bin->pkts >= sp->sfb_drop_thresh) 938 ret = 1; /* drop or mark */ 939 sfb_increment_bin(sp, bin, SFB_BINFT(sp, 1, n, s), now); 940 } 941#else /* SFB_LEVELS != 2 */ 942 for (i = 0; i < SFB_LEVELS; i++) { 943 if (s == 0) /* set 0, bin index [0,1] */ 944 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i]); 945 else /* set 1, bin index [2,3] */ 946 n = SFB_BINMASK(pkt->pkt_sfb_hash8[i + 2]); 947 948 bin = SFB_BINST(sp, i, n, s); 949 if (*pmin > (u_int16_t)bin->pmark) 950 *pmin = (u_int16_t)bin->pmark; 951 952 if (bin->pkts >= sp->sfb_allocation) { 953 if (bin->pkts >= sp->sfb_drop_thresh) 954 ret = 1; /* drop or mark */ 955 sfb_increment_bin(sp, bin, 956 SFB_BINFT(sp, i, n, s), now); 957 } 958 } 959#endif /* SFB_LEVELS != 2 */ 960 961 if (sp->sfb_flags & SFBF_SUSPENDED) 962 ret = 1; /* drop or mark */ 963 964 return (ret); 965} 966 967#define DTYPE_NODROP 0 /* no drop */ 968#define DTYPE_FORCED 1 /* a "forced" drop */ 969#define DTYPE_EARLY 2 /* an "unforced" (early) drop */ 970 971int 972sfb_addq(struct sfb *sp, class_queue_t *q, struct mbuf *m, struct pf_mtag *t) 973{ 974#if !PF_ECN 975#pragma unused(t) 976#endif /* !PF_ECN */ 977 struct pkthdr *pkt = &m->m_pkthdr; 978 struct timespec now; 979 int droptype, s; 980 u_int16_t pmin; 981 int fc_adv = 0; 982 int ret = CLASSQEQ_SUCCESS; 983 984 nanouptime(&now); 985 986 s = sp->sfb_current; 987 VERIFY((s + (s ^ 1)) == 1); 988 989 /* time to swap the bins? */ 990 if (net_timercmp(&now, &sp->sfb_nextreset, >=)) { 991 net_timeradd(&now, &sp->sfb_hinterval, &sp->sfb_nextreset); 992 sfb_swap_bins(sp, qlen(q)); 993 s = sp->sfb_current; 994 VERIFY((s + (s ^ 1)) == 1); 995 } 996 997 pkt->pkt_sfb_flags = 0; 998 pkt->pkt_sfb_hash16[s] = 999 (SFB_HASH(&pkt->pkt_flowid, sizeof (pkt->pkt_flowid), 1000 (*sp->sfb_bins)[s].fudge) & SFB_HASHMASK); 1001 pkt->pkt_sfb_hash16[s ^ 1] = 1002 (SFB_HASH(&pkt->pkt_flowid, sizeof (pkt->pkt_flowid), 1003 (*sp->sfb_bins)[s ^ 1].fudge) & SFB_HASHMASK); 1004 1005 /* see if we drop early */ 1006 droptype = DTYPE_NODROP; 1007 if (sfb_drop_early(sp, pkt, &pmin, &now)) { 1008 /* flow control, mark or drop by sfb */ 1009 if ((sp->sfb_flags & SFBF_FLOWCTL) && 1010 (pkt->pkt_flags & PKTF_FLOW_ADV)) { 1011 fc_adv = 1; 1012 /* drop all during suspension or for non-TCP */ 1013 if ((sp->sfb_flags & SFBF_SUSPENDED) || 1014 pkt->pkt_proto != IPPROTO_TCP) { 1015 droptype = DTYPE_EARLY; 1016 sp->sfb_stats.drop_early++; 1017 } 1018 } 1019#if PF_ECN 1020 else if ((sp->sfb_flags & SFBF_ECN) && 1021 (pkt->pkt_proto == IPPROTO_TCP) && /* only for TCP */ 1022 ((sfb_random(sp) & SFB_MAX_PMARK) <= pmin) && 1023 mark_ecn(m, t, sp->sfb_flags) && 1024 !(sp->sfb_flags & SFBF_SUSPENDED)) { 1025 /* successfully marked; do not drop. */ 1026 sp->sfb_stats.marked_packets++; 1027 } 1028#endif /* PF_ECN */ 1029 else { 1030 /* unforced drop by sfb */ 1031 droptype = DTYPE_EARLY; 1032 sp->sfb_stats.drop_early++; 1033 } 1034 } 1035 1036 /* non-responsive flow penalty? */ 1037 if (droptype == DTYPE_NODROP && sfb_penalize(sp, pkt, &now)) { 1038 droptype = DTYPE_FORCED; 1039 sp->sfb_stats.drop_pbox++; 1040 } 1041 1042 /* if the queue length hits the hard limit, it's a forced drop */ 1043 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) { 1044 droptype = DTYPE_FORCED; 1045 sp->sfb_stats.drop_queue++; 1046 } 1047 1048 if (fc_adv == 1 && droptype != DTYPE_FORCED && 1049 sfb_bin_addfcentry(sp, pkt)) { 1050 /* deliver flow control advisory error */ 1051 if (droptype == DTYPE_NODROP) { 1052 ret = CLASSQEQ_SUCCESS_FC; 1053 VERIFY(!(sp->sfb_flags & SFBF_SUSPENDED)); 1054 } else if (sp->sfb_flags & SFBF_SUSPENDED) { 1055 /* dropped due to suspension */ 1056 ret = CLASSQEQ_DROPPED_SP; 1057 } else { 1058 /* dropped due to flow-control */ 1059 ret = CLASSQEQ_DROPPED_FC; 1060 } 1061 } 1062 1063 /* if successful enqueue this packet, else drop it */ 1064 if (droptype == DTYPE_NODROP) { 1065 _addq(q, m); 1066 } else { 1067 IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd); 1068 m_freem(m); 1069 return ((ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROPPED); 1070 } 1071 1072 if (!(pkt->pkt_sfb_flags & SFB_PKT_PBOX)) 1073 sfb_eq_update_bins(sp, pkt); 1074 else 1075 sp->sfb_stats.pbox_packets++; 1076 1077 /* successfully queued */ 1078 return (ret); 1079} 1080 1081static struct mbuf * 1082sfb_getq_flow(struct sfb *sp, class_queue_t *q, u_int32_t flow, boolean_t purge) 1083{ 1084 struct timespec now; 1085 struct mbuf *m; 1086 struct pkthdr *pkt; 1087 1088 if (!purge && (sp->sfb_flags & SFBF_SUSPENDED)) 1089 return (NULL); 1090 1091 nanouptime(&now); 1092 1093 /* flow of 0 means head of queue */ 1094 if ((m = ((flow == 0) ? _getq(q) : _getq_flow(q, flow))) == NULL) { 1095 if (!purge) 1096 net_timerclear(&sp->sfb_getqtime); 1097 return (NULL); 1098 } 1099 1100 VERIFY(m->m_flags & M_PKTHDR); 1101 1102 pkt = &m->m_pkthdr; 1103 1104 if (!purge) { 1105 /* calculate EWMA of dequeues */ 1106 if (net_timerisset(&sp->sfb_getqtime)) { 1107 struct timespec delta; 1108 u_int64_t avg, new; 1109 1110 net_timersub(&now, &sp->sfb_getqtime, &delta); 1111 net_timernsec(&delta, &new); 1112 avg = sp->sfb_stats.dequeue_avg; 1113 if (avg > 0) { 1114 int decay = DEQUEUE_DECAY; 1115 /* 1116 * If the time since last dequeue is 1117 * significantly greater than the current 1118 * average, weight the average more against 1119 * the old value. 1120 */ 1121 if (DEQUEUE_SPIKE(new, avg)) 1122 decay += 5; 1123 avg = (((avg << decay) - avg) + new) >> decay; 1124 } else { 1125 avg = new; 1126 } 1127 sp->sfb_stats.dequeue_avg = avg; 1128 } 1129 *(&sp->sfb_getqtime) = *(&now); 1130 } 1131 1132 /* 1133 * Clearpkts are the ones which were in the queue when the hash 1134 * function was perturbed. Since the perturbation value (fudge), 1135 * and thus bin information for these packets is not known, we do 1136 * not change accounting information while dequeuing these packets. 1137 * It is important not to set the hash interval too small due to 1138 * this reason. A rule of thumb is to set it to K*D, where D is 1139 * the time taken to drain queue. 1140 */ 1141 if (pkt->pkt_sfb_flags & SFB_PKT_PBOX) { 1142 pkt->pkt_sfb_flags &= ~SFB_PKT_PBOX; 1143 if (sp->sfb_clearpkts > 0) 1144 sp->sfb_clearpkts--; 1145 } else if (sp->sfb_clearpkts > 0) { 1146 sp->sfb_clearpkts--; 1147 } else { 1148 sfb_dq_update_bins(sp, pkt, &now); 1149 } 1150 1151 return (m); 1152} 1153 1154struct mbuf * 1155sfb_getq(struct sfb *sp, class_queue_t *q) 1156{ 1157 return (sfb_getq_flow(sp, q, 0, FALSE)); 1158} 1159 1160void 1161sfb_purgeq(struct sfb *sp, class_queue_t *q, u_int32_t flow, u_int32_t *packets, 1162 u_int32_t *bytes) 1163{ 1164 u_int32_t cnt = 0, len = 0; 1165 struct mbuf *m; 1166 1167 IFCQ_CONVERT_LOCK(&sp->sfb_ifp->if_snd); 1168 1169 while ((m = sfb_getq_flow(sp, q, flow, TRUE)) != NULL) { 1170 cnt++; 1171 len += m_pktlen(m); 1172 m_freem(m); 1173 } 1174 1175 if (packets != NULL) 1176 *packets = cnt; 1177 if (bytes != NULL) 1178 *bytes = len; 1179} 1180 1181void 1182sfb_updateq(struct sfb *sp, cqev_t ev) 1183{ 1184 struct ifnet *ifp = sp->sfb_ifp; 1185 1186 VERIFY(ifp != NULL); 1187 1188 switch (ev) { 1189 case CLASSQ_EV_LINK_BANDWIDTH: { 1190 u_int64_t eff_rate = ifnet_output_linkrate(ifp); 1191 1192 /* update parameters only if rate has changed */ 1193 if (eff_rate == sp->sfb_eff_rate) 1194 break; 1195 1196 if (classq_verbose) { 1197 log(LOG_DEBUG, "%s: SFB qid=%d, adapting to new " 1198 "eff_rate=%llu bps\n", if_name(ifp), sp->sfb_qid, 1199 eff_rate); 1200 } 1201 sfb_calc_holdtime(sp, eff_rate); 1202 sfb_calc_pboxtime(sp, eff_rate); 1203 break; 1204 } 1205 1206 case CLASSQ_EV_LINK_UP: 1207 case CLASSQ_EV_LINK_DOWN: 1208 if (classq_verbose) { 1209 log(LOG_DEBUG, "%s: SFB qid=%d, resetting due to " 1210 "link %s\n", if_name(ifp), sp->sfb_qid, 1211 (ev == CLASSQ_EV_LINK_UP) ? "UP" : "DOWN"); 1212 } 1213 sfb_resetq(sp, ev); 1214 break; 1215 1216 case CLASSQ_EV_LINK_LATENCY: 1217 case CLASSQ_EV_LINK_MTU: 1218 default: 1219 break; 1220 } 1221} 1222 1223int 1224sfb_suspendq(struct sfb *sp, class_queue_t *q, boolean_t on) 1225{ 1226#pragma unused(q) 1227 struct ifnet *ifp = sp->sfb_ifp; 1228 1229 VERIFY(ifp != NULL); 1230 1231 if ((on && (sp->sfb_flags & SFBF_SUSPENDED)) || 1232 (!on && !(sp->sfb_flags & SFBF_SUSPENDED))) 1233 return (0); 1234 1235 if (!(sp->sfb_flags & SFBF_FLOWCTL)) { 1236 log(LOG_ERR, "%s: SFB qid=%d, unable to %s queue since " 1237 "flow-control is not enabled", if_name(ifp), sp->sfb_qid, 1238 (on ? "suspend" : "resume")); 1239 return (ENOTSUP); 1240 } 1241 1242 if (classq_verbose) { 1243 log(LOG_DEBUG, "%s: SFB qid=%d, setting state to %s", 1244 if_name(ifp), sp->sfb_qid, (on ? "SUSPENDED" : "RUNNING")); 1245 } 1246 1247 if (on) { 1248 sp->sfb_flags |= SFBF_SUSPENDED; 1249 } else { 1250 sp->sfb_flags &= ~SFBF_SUSPENDED; 1251 sfb_swap_bins(sp, qlen(q)); 1252 } 1253 1254 return (0); 1255} 1256