/*
 * Copyright (c) 2011-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (c) 2010 Fabio Checconi, Luigi Rizzo, Paolo Valente
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Quick Fair Queueing is described in
 * "QFQ: Efficient Packet Scheduling with Tight Bandwidth Distribution
 * Guarantees" by Fabio Checconi, Paolo Valente, and Luigi Rizzo.
 *
 * This code is ported from the dummynet(4) QFQ implementation.
 * See also http://info.iet.unipi.it/~luigi/qfq/
 */
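
/*
 * Overview of the structures used below (a summary inferred from this
 * file, not a restatement of the paper): each class (flow) carries
 * virtual start/finish timestamps cl_S/cl_F and an inverse weight
 * cl_inv_w.  Classes with similar lmax/weight ratios share a group
 * (struct qfq_group) indexed by roughly log2(lmax/weight); see
 * qfq_calc_index().  Each group keeps a circular array of slots of
 * backlogged classes ordered by rounded start time.  Every group lives
 * in one of four sets -- ER, IR, EB, IB, i.e. {Eligible,Ineligible} x
 * {Ready,Blocked} -- held as bitmaps in qif_bitmaps[], while a global
 * virtual time qif_V advances as packets are dequeued.
 */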

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/syslog.h>

#include <kern/zalloc.h>

#include <net/if.h>
#include <net/net_osdep.h>

#include <net/pktsched/pktsched_qfq.h>
#include <netinet/in.h>

/*
 * function prototypes
 */
static int qfq_enqueue_ifclassq(struct ifclassq *, struct mbuf *);
static struct mbuf *qfq_dequeue_ifclassq(struct ifclassq *, cqdq_op_t);
static int qfq_request_ifclassq(struct ifclassq *, cqrq_t, void *);
static int qfq_clear_interface(struct qfq_if *);
static struct qfq_class *qfq_class_create(struct qfq_if *, u_int32_t,
    u_int32_t, u_int32_t, u_int32_t, u_int32_t);
static int qfq_class_destroy(struct qfq_if *, struct qfq_class *);
static int qfq_destroy_locked(struct qfq_if *);
static inline int qfq_addq(struct qfq_class *, struct mbuf *, struct pf_mtag *);
static inline struct mbuf *qfq_getq(struct qfq_class *);
static inline struct mbuf *qfq_pollq(struct qfq_class *);
static void qfq_purgeq(struct qfq_if *, struct qfq_class *, u_int32_t,
    u_int32_t *, u_int32_t *);
static void qfq_purge_sc(struct qfq_if *, cqrq_purge_sc_t *);
static void qfq_updateq(struct qfq_if *, struct qfq_class *, cqev_t);
static int qfq_throttle(struct qfq_if *, cqrq_throttle_t *);
static int qfq_resumeq(struct qfq_if *, struct qfq_class *);
static int qfq_suspendq(struct qfq_if *, struct qfq_class *);
static int qfq_stat_sc(struct qfq_if *, cqrq_stat_sc_t *);
static inline struct qfq_class *qfq_clh_to_clp(struct qfq_if *, u_int32_t);
static const char *qfq_style(struct qfq_if *);

static inline int qfq_gt(u_int64_t, u_int64_t);
static inline u_int64_t qfq_round_down(u_int64_t, u_int32_t);
static inline struct qfq_group *qfq_ffs(struct qfq_if *, pktsched_bitmap_t);
static int qfq_calc_index(struct qfq_class *, u_int32_t, u_int32_t);
static inline pktsched_bitmap_t mask_from(pktsched_bitmap_t, int);
static inline u_int32_t qfq_calc_state(struct qfq_if *, struct qfq_group *);
static inline void qfq_move_groups(struct qfq_if *, pktsched_bitmap_t,
    int, int);
static inline void qfq_unblock_groups(struct qfq_if *, int, u_int64_t);
static inline void qfq_make_eligible(struct qfq_if *, u_int64_t);
static inline void qfq_slot_insert(struct qfq_if *, struct qfq_group *,
    struct qfq_class *, u_int64_t);
static inline void qfq_front_slot_remove(struct qfq_group *);
static inline struct qfq_class *qfq_slot_scan(struct qfq_if *,
    struct qfq_group *);
static inline void qfq_slot_rotate(struct qfq_if *, struct qfq_group *,
    u_int64_t);
static inline void qfq_update_eligible(struct qfq_if *, u_int64_t);
static inline int qfq_update_class(struct qfq_if *, struct qfq_group *,
    struct qfq_class *);
static inline void qfq_update_start(struct qfq_if *, struct qfq_class *);
static inline void qfq_slot_remove(struct qfq_if *, struct qfq_group *,
    struct qfq_class *);
static void qfq_deactivate_class(struct qfq_if *, struct qfq_class *);
static const char *qfq_state2str(int);
#if QFQ_DEBUG
static void qfq_dump_groups(struct qfq_if *, u_int32_t);
static void qfq_dump_sched(struct qfq_if *, const char *);
#endif /* QFQ_DEBUG */

#define	QFQ_ZONE_MAX	32		/* maximum elements in zone */
#define	QFQ_ZONE_NAME	"pktsched_qfq"	/* zone name */

static unsigned int qfq_size;		/* size of zone element */
static struct zone *qfq_zone;		/* zone for qfq */

#define	QFQ_CL_ZONE_MAX	32	/* maximum elements in zone */
#define	QFQ_CL_ZONE_NAME	"pktsched_qfq_cl" /* zone name */

static unsigned int qfq_cl_size;	/* size of zone element */
static struct zone *qfq_cl_zone;	/* zone for qfq_class */

/*
 * Maximum number of consecutive slots occupied by backlogged classes
 * inside a group.  This is approx lmax/lmin + 5.  Used when ALTQ is
 * available.
 *
 * XXX check because it poses constraints on MAX_INDEX
 */
#define	QFQ_MAX_SLOTS	32	/* default when ALTQ is available */

void
qfq_init(void)
{
	qfq_size = sizeof (struct qfq_if);
	qfq_zone = zinit(qfq_size, QFQ_ZONE_MAX * qfq_size,
	    0, QFQ_ZONE_NAME);
	if (qfq_zone == NULL) {
		panic("%s: failed allocating %s", __func__, QFQ_ZONE_NAME);
		/* NOTREACHED */
	}
	zone_change(qfq_zone, Z_EXPAND, TRUE);
	zone_change(qfq_zone, Z_CALLERACCT, TRUE);

	qfq_cl_size = sizeof (struct qfq_class);
	qfq_cl_zone = zinit(qfq_cl_size, QFQ_CL_ZONE_MAX * qfq_cl_size,
	    0, QFQ_CL_ZONE_NAME);
	if (qfq_cl_zone == NULL) {
		panic("%s: failed allocating %s", __func__, QFQ_CL_ZONE_NAME);
		/* NOTREACHED */
	}
	zone_change(qfq_cl_zone, Z_EXPAND, TRUE);
	zone_change(qfq_cl_zone, Z_CALLERACCT, TRUE);
}

struct qfq_if *
qfq_alloc(struct ifnet *ifp, int how, boolean_t altq)
{
	struct qfq_if	*qif;

	qif = (how == M_WAITOK) ? zalloc(qfq_zone) : zalloc_noblock(qfq_zone);
	if (qif == NULL)
		return (NULL);

	bzero(qif, qfq_size);
	qif->qif_ifq = &ifp->if_snd;
	if (altq) {
		qif->qif_maxclasses = QFQ_MAX_CLASSES;
		qif->qif_maxslots = QFQ_MAX_SLOTS;
		qif->qif_flags |= QFQIFF_ALTQ;
	} else {
		qif->qif_maxclasses = IFCQ_SC_MAX;
		/*
		 * TODO: adi@apple.com
		 *
		 * Ideally I would like to have the following
		 * but QFQ needs further modifications.
		 *
		 *	qif->qif_maxslots = IFCQ_SC_MAX;
		 */
		qif->qif_maxslots = QFQ_MAX_SLOTS;
	}

	if ((qif->qif_class_tbl = _MALLOC(sizeof (struct qfq_class *) *
	    qif->qif_maxclasses, M_DEVBUF, M_WAITOK|M_ZERO)) == NULL) {
		log(LOG_ERR, "%s: %s unable to allocate class table array\n",
		    if_name(ifp), qfq_style(qif));
		goto error;
	}

	if ((qif->qif_groups = _MALLOC(sizeof (struct qfq_group *) *
	    (QFQ_MAX_INDEX + 1), M_DEVBUF, M_WAITOK|M_ZERO)) == NULL) {
		log(LOG_ERR, "%s: %s unable to allocate group array\n",
		    if_name(ifp), qfq_style(qif));
		goto error;
	}

	if (pktsched_verbose) {
		log(LOG_DEBUG, "%s: %s scheduler allocated\n",
		    if_name(ifp), qfq_style(qif));
	}

	return (qif);

error:
	if (qif->qif_class_tbl != NULL) {
		_FREE(qif->qif_class_tbl, M_DEVBUF);
		qif->qif_class_tbl = NULL;
	}
	if (qif->qif_groups != NULL) {
		_FREE(qif->qif_groups, M_DEVBUF);
		qif->qif_groups = NULL;
	}
	zfree(qfq_zone, qif);

	return (NULL);
}

int
qfq_destroy(struct qfq_if *qif)
{
	struct ifclassq *ifq = qif->qif_ifq;
	int err;

	IFCQ_LOCK(ifq);
	err = qfq_destroy_locked(qif);
	IFCQ_UNLOCK(ifq);

	return (err);
}

static int
qfq_destroy_locked(struct qfq_if *qif)
{
	int i;

	IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq);

	(void) qfq_clear_interface(qif);

	VERIFY(qif->qif_class_tbl != NULL);
	_FREE(qif->qif_class_tbl, M_DEVBUF);
	qif->qif_class_tbl = NULL;

	VERIFY(qif->qif_groups != NULL);
	for (i = 0; i <= QFQ_MAX_INDEX; i++) {
		struct qfq_group *grp = qif->qif_groups[i];

		if (grp != NULL) {
			VERIFY(grp->qfg_slots != NULL);
			_FREE(grp->qfg_slots, M_DEVBUF);
			grp->qfg_slots = NULL;
			_FREE(grp, M_DEVBUF);
			qif->qif_groups[i] = NULL;
		}
	}
	_FREE(qif->qif_groups, M_DEVBUF);
	qif->qif_groups = NULL;

	if (pktsched_verbose) {
		log(LOG_DEBUG, "%s: %s scheduler destroyed\n",
		    if_name(QFQIF_IFP(qif)), qfq_style(qif));
	}

	zfree(qfq_zone, qif);

	return (0);
}

/*
 * bring the interface back to the initial state by discarding
 * all the filters and classes.
 */
static int
qfq_clear_interface(struct qfq_if *qif)
{
	struct qfq_class *cl;
	int i;

	IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq);

	/* clear out the classes */
	for (i = 0; i < qif->qif_maxclasses; i++)
		if ((cl = qif->qif_class_tbl[i]) != NULL)
			qfq_class_destroy(qif, cl);

	return (0);
}

/* discard all the queued packets on the interface */
void
qfq_purge(struct qfq_if *qif)
{
	struct qfq_class *cl;
	int i;

	IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq);

	for (i = 0; i < qif->qif_maxclasses; i++) {
		if ((cl = qif->qif_class_tbl[i]) != NULL)
			qfq_purgeq(qif, cl, 0, NULL, NULL);
	}
#if !PF_ALTQ
	/*
	 * This assertion is safe to be made only when PF_ALTQ is not
	 * configured; otherwise, IFCQ_LEN represents the sum of the
	 * packets managed by ifcq_disc and altq_disc instances, which
	 * is possible when transitioning between the two.
	 */
	VERIFY(IFCQ_LEN(qif->qif_ifq) == 0);
#endif /* !PF_ALTQ */
}

static void
qfq_purge_sc(struct qfq_if *qif, cqrq_purge_sc_t *pr)
{
	struct ifclassq *ifq = qif->qif_ifq;
	u_int32_t i;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	VERIFY(pr->sc == MBUF_SC_UNSPEC || MBUF_VALID_SC(pr->sc));
	VERIFY(pr->flow != 0);

	if (pr->sc != MBUF_SC_UNSPEC) {
		i = MBUF_SCIDX(pr->sc);
		VERIFY(i < IFCQ_SC_MAX);

		qfq_purgeq(qif, ifq->ifcq_disc_slots[i].cl,
		    pr->flow, &pr->packets, &pr->bytes);
	} else {
		u_int32_t cnt, len;

		pr->packets = 0;
		pr->bytes = 0;

		for (i = 0; i < IFCQ_SC_MAX; i++) {
			qfq_purgeq(qif, ifq->ifcq_disc_slots[i].cl,
			    pr->flow, &cnt, &len);
			pr->packets += cnt;
			pr->bytes += len;
		}
	}
}

void
qfq_event(struct qfq_if *qif, cqev_t ev)
{
	struct qfq_class *cl;
	int i;

	IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq);

	for (i = 0; i < qif->qif_maxclasses; i++)
		if ((cl = qif->qif_class_tbl[i]) != NULL)
			qfq_updateq(qif, cl, ev);
}

int
qfq_add_queue(struct qfq_if *qif, u_int32_t qlimit, u_int32_t weight,
    u_int32_t maxsz, u_int32_t flags, u_int32_t qid, struct qfq_class **clp)
{
	struct qfq_class *cl;
	u_int32_t w;

	IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq);

	if (qfq_clh_to_clp(qif, qid) != NULL)
		return (EBUSY);

	/* check parameters */
	if (weight == 0 || weight > QFQ_MAX_WEIGHT)
		return (EINVAL);

	w = (QFQ_ONE_FP / (QFQ_ONE_FP / weight));
	if (qif->qif_wsum + w > QFQ_MAX_WSUM)
		return (EINVAL);

	if (maxsz == 0 || maxsz > (1 << QFQ_MTU_SHIFT))
		return (EINVAL);

	cl = qfq_class_create(qif, weight, qlimit, flags, maxsz, qid);
	if (cl == NULL)
		return (ENOMEM);

	if (clp != NULL)
		*clp = cl;

	return (0);
}

static struct qfq_class *
qfq_class_create(struct qfq_if *qif, u_int32_t weight, u_int32_t qlimit,
    u_int32_t flags, u_int32_t maxsz, u_int32_t qid)
{
	struct ifnet *ifp;
	struct ifclassq *ifq;
	struct qfq_group *grp;
	struct qfq_class *cl;
	u_int32_t w;			/* approximated weight */
	int i;

	IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq);

	/* Sanitize flags unless internally configured */
	if (qif->qif_flags & QFQIFF_ALTQ)
		flags &= QFCF_USERFLAGS;

	if (qif->qif_classes >= qif->qif_maxclasses) {
		log(LOG_ERR, "%s: %s out of classes! (max %d)\n",
		    if_name(QFQIF_IFP(qif)), qfq_style(qif),
		    qif->qif_maxclasses);
		return (NULL);
	}

#if !CLASSQ_RED
	if (flags & QFCF_RED) {
		log(LOG_ERR, "%s: %s RED not available!\n",
		    if_name(QFQIF_IFP(qif)), qfq_style(qif));
		return (NULL);
	}
#endif /* !CLASSQ_RED */

#if !CLASSQ_RIO
	if (flags & QFCF_RIO) {
		log(LOG_ERR, "%s: %s RIO not available!\n",
		    if_name(QFQIF_IFP(qif)), qfq_style(qif));
		return (NULL);
	}
#endif /* !CLASSQ_RIO */

#if !CLASSQ_BLUE
	if (flags & QFCF_BLUE) {
		log(LOG_ERR, "%s: %s BLUE not available!\n",
		    if_name(QFQIF_IFP(qif)), qfq_style(qif));
		return (NULL);
	}
#endif /* !CLASSQ_BLUE */

	/* These are mutually exclusive */
	if ((flags & (QFCF_RED|QFCF_RIO|QFCF_BLUE|QFCF_SFB)) &&
	    (flags & (QFCF_RED|QFCF_RIO|QFCF_BLUE|QFCF_SFB)) != QFCF_RED &&
	    (flags & (QFCF_RED|QFCF_RIO|QFCF_BLUE|QFCF_SFB)) != QFCF_RIO &&
	    (flags & (QFCF_RED|QFCF_RIO|QFCF_BLUE|QFCF_SFB)) != QFCF_BLUE &&
	    (flags & (QFCF_RED|QFCF_RIO|QFCF_BLUE|QFCF_SFB)) != QFCF_SFB) {
		log(LOG_ERR, "%s: %s more than one RED|RIO|BLUE|SFB\n",
		    if_name(QFQIF_IFP(qif)), qfq_style(qif));
		return (NULL);
	}

	ifq = qif->qif_ifq;
	ifp = QFQIF_IFP(qif);

	cl = zalloc(qfq_cl_zone);
	if (cl == NULL)
		return (NULL);

	bzero(cl, qfq_cl_size);

	if (qlimit == 0 || qlimit > IFCQ_MAXLEN(ifq)) {
		qlimit = IFCQ_MAXLEN(ifq);
		if (qlimit == 0)
			qlimit = DEFAULT_QLIMIT;  /* use default */
	}
	_qinit(&cl->cl_q, Q_DROPTAIL, qlimit);
	cl->cl_qif = qif;
	cl->cl_flags = flags;
	cl->cl_handle = qid;

	/*
	 * Find a free slot in the class table.  If the slot matching
	 * the lower bits of qid is free, use this slot.  Otherwise,
	 * use the first free slot.
	 */
	i = qid % qif->qif_maxclasses;
	if (qif->qif_class_tbl[i] == NULL) {
		qif->qif_class_tbl[i] = cl;
	} else {
		for (i = 0; i < qif->qif_maxclasses; i++) {
			if (qif->qif_class_tbl[i] == NULL) {
				qif->qif_class_tbl[i] = cl;
				break;
			}
		}
		if (i == qif->qif_maxclasses) {
			zfree(qfq_cl_zone, cl);
			return (NULL);
		}
	}

	w = weight;
	VERIFY(w > 0 && w <= QFQ_MAX_WEIGHT);
	cl->cl_lmax = maxsz;
	cl->cl_inv_w = (QFQ_ONE_FP / w);
	w = (QFQ_ONE_FP / cl->cl_inv_w);
	VERIFY(qif->qif_wsum + w <= QFQ_MAX_WSUM);

	i = qfq_calc_index(cl, cl->cl_inv_w, cl->cl_lmax);
	VERIFY(i <= QFQ_MAX_INDEX);
	grp = qif->qif_groups[i];
	if (grp == NULL) {
		grp = _MALLOC(sizeof (*grp), M_DEVBUF, M_WAITOK|M_ZERO);
		if (grp != NULL) {
			grp->qfg_index = i;
			grp->qfg_slot_shift =
			    QFQ_MTU_SHIFT + QFQ_FRAC_BITS - (QFQ_MAX_INDEX - i);
			grp->qfg_slots = _MALLOC(sizeof (struct qfq_class *) *
			    qif->qif_maxslots, M_DEVBUF, M_WAITOK|M_ZERO);
			if (grp->qfg_slots == NULL) {
				log(LOG_ERR, "%s: %s unable to allocate group "
				    "slots for index %d\n", if_name(ifp),
				    qfq_style(qif), i);
			}
		} else {
			log(LOG_ERR, "%s: %s unable to allocate group for "
			    "qid=%d\n", if_name(ifp), qfq_style(qif),
			    cl->cl_handle);
		}
		if (grp == NULL || grp->qfg_slots == NULL) {
			qif->qif_class_tbl[qid % qif->qif_maxclasses] = NULL;
			if (grp != NULL)
				_FREE(grp, M_DEVBUF);
			zfree(qfq_cl_zone, cl);
			return (NULL);
		} else {
			qif->qif_groups[i] = grp;
		}
	}
	cl->cl_grp = grp;
	qif->qif_wsum += w;
	/* XXX cl->cl_S = qif->qif_V; ? */
	/* XXX compute qif->qif_i_wsum */

	qif->qif_classes++;

	if (flags & QFCF_DEFAULTCLASS)
		qif->qif_default = cl;

	if (flags & (QFCF_RED|QFCF_RIO|QFCF_BLUE|QFCF_SFB)) {
#if CLASSQ_RED || CLASSQ_RIO
		u_int64_t ifbandwidth = ifnet_output_linkrate(ifp);
		int pkttime;
#endif /* CLASSQ_RED || CLASSQ_RIO */

		cl->cl_qflags = 0;
		if (flags & QFCF_ECN) {
			if (flags & QFCF_BLUE)
				cl->cl_qflags |= BLUEF_ECN;
			else if (flags & QFCF_SFB)
				cl->cl_qflags |= SFBF_ECN;
			else if (flags & QFCF_RED)
				cl->cl_qflags |= REDF_ECN;
			else if (flags & QFCF_RIO)
				cl->cl_qflags |= RIOF_ECN;
		}
		if (flags & QFCF_FLOWCTL) {
			if (flags & QFCF_SFB)
				cl->cl_qflags |= SFBF_FLOWCTL;
		}
		if (flags & QFCF_DELAYBASED) {
			if (flags & QFCF_SFB)
				cl->cl_qflags |= SFBF_DELAYBASED;
		}
		if (flags & QFCF_CLEARDSCP) {
			if (flags & QFCF_RIO)
				cl->cl_qflags |= RIOF_CLEARDSCP;
		}
#if CLASSQ_RED || CLASSQ_RIO
		/*
		 * XXX: RED & RIO should be watching link speed and MTU
		 *	events and recompute pkttime accordingly.
		 */
		if (ifbandwidth < 8)
			pkttime = 1000 * 1000 * 1000; /* 1 sec */
		else
			pkttime = (int64_t)ifp->if_mtu * 1000 * 1000 * 1000 /
			    (ifbandwidth / 8);

		/* Test for exclusivity {RED,RIO,BLUE,SFB} was done above */
#if CLASSQ_RED
		if (flags & QFCF_RED) {
			cl->cl_red = red_alloc(ifp, 0, 0,
			    qlimit(&cl->cl_q) * 10/100,
			    qlimit(&cl->cl_q) * 30/100,
			    cl->cl_qflags, pkttime);
			if (cl->cl_red != NULL)
				qtype(&cl->cl_q) = Q_RED;
		}
#endif /* CLASSQ_RED */
#if CLASSQ_RIO
		if (flags & QFCF_RIO) {
			cl->cl_rio =
			    rio_alloc(ifp, 0, NULL, cl->cl_qflags, pkttime);
			if (cl->cl_rio != NULL)
				qtype(&cl->cl_q) = Q_RIO;
		}
#endif /* CLASSQ_RIO */
#endif /* CLASSQ_RED || CLASSQ_RIO */
#if CLASSQ_BLUE
		if (flags & QFCF_BLUE) {
			cl->cl_blue = blue_alloc(ifp, 0, 0, cl->cl_qflags);
			if (cl->cl_blue != NULL)
				qtype(&cl->cl_q) = Q_BLUE;
		}
#endif /* CLASSQ_BLUE */
		if (flags & QFCF_SFB) {
			if (!(cl->cl_flags & QFCF_LAZY))
				cl->cl_sfb = sfb_alloc(ifp, cl->cl_handle,
				    qlimit(&cl->cl_q), cl->cl_qflags);
			if (cl->cl_sfb != NULL || (cl->cl_flags & QFCF_LAZY))
				qtype(&cl->cl_q) = Q_SFB;
		}
	}

	if (pktsched_verbose) {
		log(LOG_DEBUG, "%s: %s created qid=%d grp=%d weight=%d "
		    "qlimit=%d flags=%b\n", if_name(ifp), qfq_style(qif),
		    cl->cl_handle, cl->cl_grp->qfg_index, weight, qlimit,
		    flags, QFCF_BITS);
	}

	return (cl);
}

int
qfq_remove_queue(struct qfq_if *qif, u_int32_t qid)
{
	struct qfq_class *cl;

	IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq);

	if ((cl = qfq_clh_to_clp(qif, qid)) == NULL)
		return (EINVAL);

	return (qfq_class_destroy(qif, cl));
}

static int
qfq_class_destroy(struct qfq_if *qif, struct qfq_class *cl)
{
	struct ifclassq *ifq = qif->qif_ifq;
	int i;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	qfq_purgeq(qif, cl, 0, NULL, NULL);

	if (cl->cl_inv_w != 0) {
		qif->qif_wsum -= (QFQ_ONE_FP / cl->cl_inv_w);
		cl->cl_inv_w = 0;	/* reset weight to avoid running twice */
	}

	for (i = 0; i < qif->qif_maxclasses; i++) {
		if (qif->qif_class_tbl[i] == cl) {
			qif->qif_class_tbl[i] = NULL;
			break;
		}
	}
	qif->qif_classes--;

	if (cl->cl_qalg.ptr != NULL) {
#if CLASSQ_RIO
		if (q_is_rio(&cl->cl_q))
			rio_destroy(cl->cl_rio);
#endif /* CLASSQ_RIO */
#if CLASSQ_RED
		if (q_is_red(&cl->cl_q))
			red_destroy(cl->cl_red);
#endif /* CLASSQ_RED */
#if CLASSQ_BLUE
		if (q_is_blue(&cl->cl_q))
			blue_destroy(cl->cl_blue);
#endif /* CLASSQ_BLUE */
		if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL)
			sfb_destroy(cl->cl_sfb);
		cl->cl_qalg.ptr = NULL;
		qtype(&cl->cl_q) = Q_DROPTAIL;
		qstate(&cl->cl_q) = QS_RUNNING;
	}

	if (qif->qif_default == cl)
		qif->qif_default = NULL;

	if (pktsched_verbose) {
		log(LOG_DEBUG, "%s: %s destroyed qid=%d\n",
		    if_name(QFQIF_IFP(qif)), qfq_style(qif), cl->cl_handle);
	}

	zfree(qfq_cl_zone, cl);

	return (0);
}

/*
 * Calculate a mask to mimic what would be ffs_from()
 */
static inline pktsched_bitmap_t
mask_from(pktsched_bitmap_t bitmap, int from)
{
	return (bitmap & ~((1UL << from) - 1));
}
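
/*
 * Illustrative example (not part of the scheduler logic): mask_from()
 * clears all bits strictly below `from'.  With bitmap = 0x2d (0b101101)
 * and from = 2, (1UL << 2) - 1 == 0x3, so the result is 0x2c (0b101100);
 * a subsequent ffs() on the result then finds the first set bit >= from.
 */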

/*
 * The state computation relies on ER=0, IR=1, EB=2, IB=3
 * First compute eligibility comparing grp->qfg_S, qif->qif_V,
 * then check if someone is blocking us and possibly add EB
 */
static inline u_int32_t
qfq_calc_state(struct qfq_if *qif, struct qfq_group *grp)
{
	/* if S > V we are not eligible */
	u_int32_t state = qfq_gt(grp->qfg_S, qif->qif_V);
	pktsched_bitmap_t mask = mask_from(qif->qif_bitmaps[ER],
	    grp->qfg_index);
	struct qfq_group *next;

	if (mask) {
		next = qfq_ffs(qif, mask);
		if (qfq_gt(grp->qfg_F, next->qfg_F))
			state |= EB;
	}

	return (state);
}
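
/*
 * Note on the encoding (derived from the ER=0, IR=1, EB=2, IB=3
 * assumption above): bit 0 of the state is the "ineligible" flag,
 * set when S > V, and bit 1 is the "blocked" flag, set when an
 * eligible-ready group at the same or a higher index has an earlier
 * finish time.  The resulting value indexes directly into
 * qif_bitmaps[].
 */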

/*
 * In principle
 *	qif->qif_bitmaps[dst] |= qif->qif_bitmaps[src] & mask;
 *	qif->qif_bitmaps[src] &= ~mask;
 * but we should make sure that src != dst
 */
static inline void
qfq_move_groups(struct qfq_if *qif, pktsched_bitmap_t mask, int src, int dst)
{
	qif->qif_bitmaps[dst] |= qif->qif_bitmaps[src] & mask;
	qif->qif_bitmaps[src] &= ~mask;
}

static inline void
qfq_unblock_groups(struct qfq_if *qif, int index, u_int64_t old_finish)
{
	pktsched_bitmap_t mask = mask_from(qif->qif_bitmaps[ER], index + 1);
	struct qfq_group *next;

	if (mask) {
		next = qfq_ffs(qif, mask);
		if (!qfq_gt(next->qfg_F, old_finish))
			return;
	}

	mask = (1UL << index) - 1;
	qfq_move_groups(qif, mask, EB, ER);
	qfq_move_groups(qif, mask, IB, IR);
}

/*
 * perhaps
 *
 *	old_V ^= qif->qif_V;
 *	old_V >>= QFQ_MIN_SLOT_SHIFT;
 *	if (old_V) {
 *		...
 *	}
 */
static inline void
qfq_make_eligible(struct qfq_if *qif, u_int64_t old_V)
{
	pktsched_bitmap_t mask, vslot, old_vslot;

	vslot = qif->qif_V >> QFQ_MIN_SLOT_SHIFT;
	old_vslot = old_V >> QFQ_MIN_SLOT_SHIFT;

	if (vslot != old_vslot) {
		mask = (2UL << (__fls(vslot ^ old_vslot))) - 1;
		qfq_move_groups(qif, mask, IR, ER);
		qfq_move_groups(qif, mask, IB, EB);
	}
}
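
/*
 * Worked example (illustrative): the highest bit in which the old and
 * new virtual-time slot numbers differ -- __fls(vslot ^ old_vslot) --
 * bounds the set of groups whose eligibility can have changed.  With
 * vslot = 9 (0b1001) and old_vslot = 5 (0b0101), vslot ^ old_vslot =
 * 0b1100, __fls() = 3 (zero-based), and mask = (2UL << 3) - 1 = 0xf,
 * so groups 0..3 are moved from the ineligible sets (IR/IB) to the
 * eligible ones (ER/EB).
 */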

/*
 * XXX we should make sure that slot becomes less than 32.
 * This is guaranteed by the input values.
 * roundedS is always cl->cl_S rounded on grp->qfg_slot_shift bits.
 */
static inline void
qfq_slot_insert(struct qfq_if *qif, struct qfq_group *grp,
    struct qfq_class *cl, u_int64_t roundedS)
{
	u_int64_t slot = (roundedS - grp->qfg_S) >> grp->qfg_slot_shift;
	u_int32_t i = (grp->qfg_front + slot) % qif->qif_maxslots;

	cl->cl_next = grp->qfg_slots[i];
	grp->qfg_slots[i] = cl;
	pktsched_bit_set(slot, &grp->qfg_full_slots);
}
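
/*
 * Illustrative example (hypothetical values): with grp->qfg_slot_shift
 * = 10, grp->qfg_S = 0x2000 and roundedS = 0x2800, the class lands
 * slot = (0x2800 - 0x2000) >> 10 = 2 buckets after the front one, and
 * bit 2 is set in qfg_full_slots so qfq_slot_scan() can later find it
 * with ffs().
 */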

/*
 * remove the entry from the slot
 */
static inline void
qfq_front_slot_remove(struct qfq_group *grp)
{
	struct qfq_class **h = &grp->qfg_slots[grp->qfg_front];

	*h = (*h)->cl_next;
	if (!*h)
		pktsched_bit_clr(0, &grp->qfg_full_slots);
}

/*
 * Returns the first full queue in a group. As a side effect,
 * adjust the bucket list so the first non-empty bucket is at
 * position 0 in qfg_full_slots.
 */
static inline struct qfq_class *
qfq_slot_scan(struct qfq_if *qif, struct qfq_group *grp)
{
	int i;

	if (pktsched_verbose > 2) {
		log(LOG_DEBUG, "%s: %s grp=%d full_slots=0x%x\n",
		    if_name(QFQIF_IFP(qif)), qfq_style(qif), grp->qfg_index,
		    grp->qfg_full_slots);
	}

	if (grp->qfg_full_slots == 0)
		return (NULL);

	i = pktsched_ffs(grp->qfg_full_slots) - 1; /* zero-based */
	if (i > 0) {
		grp->qfg_front = (grp->qfg_front + i) % qif->qif_maxslots;
		grp->qfg_full_slots >>= i;
	}

	return (grp->qfg_slots[grp->qfg_front]);
}

/*
 * adjust the bucket list. When the start time of a group decreases,
 * we move the index down (modulo qif->qif_maxslots) so we don't need to
 * move the objects. The mask of occupied slots must be shifted
 * because we use ffs() to find the first non-empty slot.
 * This covers decreases in the group's start time, but what about
 * increases of the start time?
 * Here too we should make sure that i is less than 32
 */
static inline void
qfq_slot_rotate(struct qfq_if *qif, struct qfq_group *grp, u_int64_t roundedS)
{
#pragma unused(qif)
	u_int32_t i = (grp->qfg_S - roundedS) >> grp->qfg_slot_shift;

	grp->qfg_full_slots <<= i;
	grp->qfg_front = (grp->qfg_front - i) % qif->qif_maxslots;
}
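
/*
 * Illustrative example: if the group's start time decreases by two
 * slots (i = 2), the front index moves back by two (modulo
 * qif_maxslots) and qfg_full_slots is shifted left by two, so the
 * bucket that used to be at bit 0 is now reported at bit 2, keeping
 * the ffs()-based scan above consistent.
 */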

static inline void
qfq_update_eligible(struct qfq_if *qif, u_int64_t old_V)
{
	pktsched_bitmap_t ineligible;

	ineligible = qif->qif_bitmaps[IR] | qif->qif_bitmaps[IB];
	if (ineligible) {
		if (!qif->qif_bitmaps[ER]) {
			struct qfq_group *grp;
			grp = qfq_ffs(qif, ineligible);
			if (qfq_gt(grp->qfg_S, qif->qif_V))
				qif->qif_V = grp->qfg_S;
		}
		qfq_make_eligible(qif, old_V);
	}
}

/*
 * Updates the class, returns true if also the group needs to be updated.
 */
static inline int
qfq_update_class(struct qfq_if *qif, struct qfq_group *grp,
    struct qfq_class *cl)
{
#pragma unused(qif)
	cl->cl_S = cl->cl_F;
	if (qempty(&cl->cl_q)) {
		qfq_front_slot_remove(grp);
	} else {
		u_int32_t len;
		u_int64_t roundedS;

		len = m_pktlen(qhead(&cl->cl_q));
		cl->cl_F = cl->cl_S + (u_int64_t)len * cl->cl_inv_w;
		roundedS = qfq_round_down(cl->cl_S, grp->qfg_slot_shift);
		if (roundedS == grp->qfg_S)
			return (0);

		qfq_front_slot_remove(grp);
		qfq_slot_insert(qif, grp, cl, roundedS);
	}
	return (1);
}
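
/*
 * Timestamp arithmetic used above (a worked example): cl_inv_w is
 * QFQ_ONE_FP/weight, with QFQ_ONE_FP representing 1.0 in fixed point,
 * so cl_F = cl_S + len * cl_inv_w advances the finish time by
 * len/weight.  E.g. with weight 2 and a 1000-byte packet, the finish
 * time grows by 1000 * (QFQ_ONE_FP / 2), i.e. 500 bytes' worth of
 * virtual time -- half the cost a weight-1 class would pay for the
 * same packet.
 */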

/*
 * note: CLASSQDQ_POLL returns the next packet without removing the packet
 *	from the queue.  CLASSQDQ_REMOVE is a normal dequeue operation.
 *	CLASSQDQ_REMOVE must return the same packet if called immediately
 *	after CLASSQDQ_POLL.
 */
struct mbuf *
qfq_dequeue(struct qfq_if *qif, cqdq_op_t op)
{
	pktsched_bitmap_t er_bits = qif->qif_bitmaps[ER];
	struct ifclassq *ifq = qif->qif_ifq;
	struct qfq_group *grp;
	struct qfq_class *cl;
	struct mbuf *m;
	u_int64_t old_V;
	u_int32_t len;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	for (;;) {
		if (er_bits == 0) {
#if QFQ_DEBUG
			if (qif->qif_queued && pktsched_verbose > 1)
				qfq_dump_sched(qif, "start dequeue");
#endif /* QFQ_DEBUG */
			/* no eligible and ready packet */
			return (NULL);
		}
		grp = qfq_ffs(qif, er_bits);
		/* if group is non-empty, use it */
		if (grp->qfg_full_slots != 0)
			break;
		pktsched_bit_clr(grp->qfg_index, &er_bits);
#if QFQ_DEBUG
		qif->qif_emptygrp++;
#endif /* QFQ_DEBUG */
	}
	VERIFY(!IFCQ_IS_EMPTY(ifq));

	cl = grp->qfg_slots[grp->qfg_front];
	VERIFY(cl != NULL && !qempty(&cl->cl_q));

	if (op == CLASSQDQ_POLL)
		return (qfq_pollq(cl));

	m = qfq_getq(cl);
	VERIFY(m != NULL);	/* qalg must be work conserving */
	len = m_pktlen(m);

#if QFQ_DEBUG
	qif->qif_queued--;
#endif /* QFQ_DEBUG */

	IFCQ_DEC_LEN(ifq);
	if (qempty(&cl->cl_q))
		cl->cl_period++;
	PKTCNTR_ADD(&cl->cl_xmitcnt, 1, len);
	IFCQ_XMIT_ADD(ifq, 1, len);

	old_V = qif->qif_V;
	qif->qif_V += (u_int64_t)len * QFQ_IWSUM;

	if (pktsched_verbose > 2) {
		log(LOG_DEBUG, "%s: %s qid=%d dequeue m=0x%llx F=0x%llx "
		    "V=0x%llx\n", if_name(QFQIF_IFP(qif)), qfq_style(qif),
		    cl->cl_handle, (uint64_t)VM_KERNEL_ADDRPERM(m), cl->cl_F,
		    qif->qif_V);
	}

	if (qfq_update_class(qif, grp, cl)) {
		u_int64_t old_F = grp->qfg_F;

		cl = qfq_slot_scan(qif, grp);
		if (!cl) { /* group gone, remove from ER */
			pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[ER]);
		} else {
			u_int32_t s;
			u_int64_t roundedS =
			    qfq_round_down(cl->cl_S, grp->qfg_slot_shift);

			if (grp->qfg_S == roundedS)
				goto skip_unblock;

			grp->qfg_S = roundedS;
			grp->qfg_F = roundedS + (2ULL << grp->qfg_slot_shift);

			/* remove from ER and put in the new set */
			pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[ER]);
			s = qfq_calc_state(qif, grp);
			pktsched_bit_set(grp->qfg_index, &qif->qif_bitmaps[s]);
		}
		/* we need to unblock even if the group has gone away */
		qfq_unblock_groups(qif, grp->qfg_index, old_F);
	}

skip_unblock:
	qfq_update_eligible(qif, old_V);

#if QFQ_DEBUG
	if (!qif->qif_bitmaps[ER] && qif->qif_queued && pktsched_verbose > 1)
		qfq_dump_sched(qif, "end dequeue");
#endif /* QFQ_DEBUG */

	return (m);
}

/*
 * Assign a reasonable start time for a new flow k in group i.
 * Admissible values for hat(F) are multiples of sigma_i
 * no greater than V+sigma_i.  Larger values mean that
 * we had a wraparound so we consider the timestamp to be stale.
 *
 * If F is not stale and F >= V then we set S = F.
 * Otherwise we should assign S = V, but this may violate
 * the ordering in ER. So, if we have groups in ER, set S to
 * the F_j of the first group j which would be blocking us.
 * We are guaranteed not to move S backward because
 * otherwise our group i would still be blocked.
 */
static inline void
qfq_update_start(struct qfq_if *qif, struct qfq_class *cl)
{
	pktsched_bitmap_t mask;
	u_int64_t limit, roundedF;
	int slot_shift = cl->cl_grp->qfg_slot_shift;

	roundedF = qfq_round_down(cl->cl_F, slot_shift);
	limit = qfq_round_down(qif->qif_V, slot_shift) + (1UL << slot_shift);

	if (!qfq_gt(cl->cl_F, qif->qif_V) || qfq_gt(roundedF, limit)) {
		/* timestamp was stale */
		mask = mask_from(qif->qif_bitmaps[ER], cl->cl_grp->qfg_index);
		if (mask) {
			struct qfq_group *next = qfq_ffs(qif, mask);
			if (qfq_gt(roundedF, next->qfg_F)) {
				cl->cl_S = next->qfg_F;
				return;
			}
		}
		cl->cl_S = qif->qif_V;
	} else { /* timestamp is not stale */
		cl->cl_S = cl->cl_F;
	}
}

int
qfq_enqueue(struct qfq_if *qif, struct qfq_class *cl, struct mbuf *m,
    struct pf_mtag *t)
{
	struct ifclassq *ifq = qif->qif_ifq;
	struct qfq_group *grp;
	u_int64_t roundedS;
	int len, ret, s;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(cl == NULL || cl->cl_qif == qif);

	if (cl == NULL) {
#if PF_ALTQ
		cl = qfq_clh_to_clp(qif, t->pftag_qid);
#else /* !PF_ALTQ */
		cl = qfq_clh_to_clp(qif, 0);
#endif /* !PF_ALTQ */
		if (cl == NULL) {
			cl = qif->qif_default;
			if (cl == NULL) {
				IFCQ_CONVERT_LOCK(ifq);
				m_freem(m);
				return (ENOBUFS);
			}
		}
	}

	len = m_pktlen(m);

	ret = qfq_addq(cl, m, t);
	if (ret != 0) {
		if (ret == CLASSQEQ_SUCCESS_FC) {
			/* packet enqueued, return advisory feedback */
			ret = EQFULL;
		} else {
			VERIFY(ret == CLASSQEQ_DROPPED ||
			    ret == CLASSQEQ_DROPPED_FC ||
			    ret == CLASSQEQ_DROPPED_SP);
			/* packet has been freed in qfq_addq */
			PKTCNTR_ADD(&cl->cl_dropcnt, 1, len);
			IFCQ_DROP_ADD(ifq, 1, len);
			switch (ret) {
			case CLASSQEQ_DROPPED:
				return (ENOBUFS);
			case CLASSQEQ_DROPPED_FC:
				return (EQFULL);
			case CLASSQEQ_DROPPED_SP:
				return (EQSUSPENDED);
			}
			/* NOT REACHED */
		}
	}
	IFCQ_INC_LEN(ifq);

#if QFQ_DEBUG
	qif->qif_queued++;
#endif /* QFQ_DEBUG */

	/* queue was not idle, we're done */
	if (qlen(&cl->cl_q) > 1)
		goto done;

	/* queue was idle */
	grp = cl->cl_grp;
	qfq_update_start(qif, cl);	/* adjust start time */

	/* compute new finish time and rounded start */
	cl->cl_F = cl->cl_S + (u_int64_t)len * cl->cl_inv_w;
	roundedS = qfq_round_down(cl->cl_S, grp->qfg_slot_shift);

	/*
	 * Insert cl in the correct bucket.
	 *
	 * If cl->cl_S >= grp->qfg_S we don't need to adjust the bucket list
	 * and simply go to the insertion phase.  Otherwise grp->qfg_S is
	 * decreasing, we must make room in the bucket list, and also
	 * recompute the group state.  Finally, if there were no flows
	 * in this group and nobody was in ER make sure to adjust V.
	 */
	if (grp->qfg_full_slots != 0) {
		if (!qfq_gt(grp->qfg_S, cl->cl_S))
			goto skip_update;

		/* create a slot for this cl->cl_S */
		qfq_slot_rotate(qif, grp, roundedS);

		/* group was surely ineligible, remove */
		pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[IR]);
		pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[IB]);
	} else if (!qif->qif_bitmaps[ER] && qfq_gt(roundedS, qif->qif_V)) {
		qif->qif_V = roundedS;
	}

	grp->qfg_S = roundedS;
	grp->qfg_F =
	    roundedS + (2ULL << grp->qfg_slot_shift); /* i.e. 2 sigma_i */
	s = qfq_calc_state(qif, grp);
	pktsched_bit_set(grp->qfg_index, &qif->qif_bitmaps[s]);

	if (pktsched_verbose > 2) {
		log(LOG_DEBUG, "%s: %s qid=%d enqueue m=0x%llx state=%s 0x%x "
		    "S=0x%llx F=0x%llx V=0x%llx\n", if_name(QFQIF_IFP(qif)),
		    qfq_style(qif), cl->cl_handle,
		    (uint64_t)VM_KERNEL_ADDRPERM(m), qfq_state2str(s),
		    qif->qif_bitmaps[s], cl->cl_S, cl->cl_F, qif->qif_V);
	}

skip_update:
	qfq_slot_insert(qif, grp, cl, roundedS);

done:
	/* successfully queued. */
	return (ret);
}

static inline void
qfq_slot_remove(struct qfq_if *qif, struct qfq_group *grp,
    struct qfq_class *cl)
{
#pragma unused(qif)
	struct qfq_class **pprev;
	u_int32_t i, offset;
	u_int64_t roundedS;

	roundedS = qfq_round_down(cl->cl_S, grp->qfg_slot_shift);
	offset = (roundedS - grp->qfg_S) >> grp->qfg_slot_shift;
	i = (grp->qfg_front + offset) % qif->qif_maxslots;

	pprev = &grp->qfg_slots[i];
	while (*pprev && *pprev != cl)
		pprev = &(*pprev)->cl_next;

	*pprev = cl->cl_next;
	if (!grp->qfg_slots[i])
		pktsched_bit_clr(offset, &grp->qfg_full_slots);
}

/*
 * Called to forcibly destroy a queue.
 * If the queue is not in the front bucket, or if it has
 * other queues in the front bucket, we can simply remove
 * the queue with no other side effects.
 * Otherwise we must propagate the event up.
 * XXX description to be completed.
 */
static void
qfq_deactivate_class(struct qfq_if *qif, struct qfq_class *cl)
{
	struct qfq_group *grp = cl->cl_grp;
	pktsched_bitmap_t mask;
	u_int64_t roundedS;
	int s;

	if (pktsched_verbose) {
		log(LOG_DEBUG, "%s: %s deactivate qid=%d grp=%d "
		    "full_slots=0x%x front=%d bitmaps={ER=0x%x,EB=0x%x,"
		    "IR=0x%x,IB=0x%x}\n",
		    if_name(QFQIF_IFP(cl->cl_qif)), qfq_style(cl->cl_qif),
		    cl->cl_handle, grp->qfg_index, grp->qfg_full_slots,
		    grp->qfg_front, qif->qif_bitmaps[ER], qif->qif_bitmaps[EB],
		    qif->qif_bitmaps[IR], qif->qif_bitmaps[IB]);
#if QFQ_DEBUG
		if (pktsched_verbose > 1)
			qfq_dump_sched(qif, "start deactivate");
#endif /* QFQ_DEBUG */
	}

	cl->cl_F = cl->cl_S;	/* not needed if the class goes away */
	qfq_slot_remove(qif, grp, cl);

	if (grp->qfg_full_slots == 0) {
		/*
		 * Nothing left in the group, remove from all sets.
		 * Do ER last because if we were blocking other groups
		 * we must unblock them.
		 */
		pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[IR]);
		pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[EB]);
		pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[IB]);

		if (pktsched_bit_tst(grp->qfg_index, &qif->qif_bitmaps[ER]) &&
		    !(qif->qif_bitmaps[ER] & ~((1UL << grp->qfg_index) - 1))) {
			mask = qif->qif_bitmaps[ER] &
			    ((1UL << grp->qfg_index) - 1);
			if (mask)
				mask = ~((1UL << __fls(mask)) - 1);
			else
				mask = (pktsched_bitmap_t)~0UL;
			qfq_move_groups(qif, mask, EB, ER);
			qfq_move_groups(qif, mask, IB, IR);
		}
		pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[ER]);
	} else if (!grp->qfg_slots[grp->qfg_front]) {
		cl = qfq_slot_scan(qif, grp);
		roundedS = qfq_round_down(cl->cl_S, grp->qfg_slot_shift);
		if (grp->qfg_S != roundedS) {
			pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[ER]);
			pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[IR]);
			pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[EB]);
			pktsched_bit_clr(grp->qfg_index, &qif->qif_bitmaps[IB]);
			grp->qfg_S = roundedS;
			grp->qfg_F = roundedS + (2ULL << grp->qfg_slot_shift);
			s = qfq_calc_state(qif, grp);
			pktsched_bit_set(grp->qfg_index, &qif->qif_bitmaps[s]);
		}
	}
	qfq_update_eligible(qif, qif->qif_V);

#if QFQ_DEBUG
	if (pktsched_verbose > 1)
		qfq_dump_sched(qif, "end deactivate");
#endif /* QFQ_DEBUG */
}

static const char *
qfq_state2str(int s)
{
	const char *c;

	switch (s) {
	case ER:
		c = "ER";
		break;
	case IR:
		c = "IR";
		break;
	case EB:
		c = "EB";
		break;
	case IB:
		c = "IB";
		break;
	default:
		c = "?";
		break;
	}
	return (c);
}

static inline int
qfq_addq(struct qfq_class *cl, struct mbuf *m, struct pf_mtag *t)
{
	struct qfq_if	*qif = cl->cl_qif;
	struct ifclassq *ifq = qif->qif_ifq;

	IFCQ_LOCK_ASSERT_HELD(ifq);

#if CLASSQ_RIO
	if (q_is_rio(&cl->cl_q))
		return (rio_addq(cl->cl_rio, &cl->cl_q, m, t));
	else
#endif /* CLASSQ_RIO */
#if CLASSQ_RED
	if (q_is_red(&cl->cl_q))
		return (red_addq(cl->cl_red, &cl->cl_q, m, t));
	else
#endif /* CLASSQ_RED */
#if CLASSQ_BLUE
	if (q_is_blue(&cl->cl_q))
		return (blue_addq(cl->cl_blue, &cl->cl_q, m, t));
	else
#endif /* CLASSQ_BLUE */
	if (q_is_sfb(&cl->cl_q)) {
		if (cl->cl_sfb == NULL) {
			struct ifnet *ifp = QFQIF_IFP(qif);

			VERIFY(cl->cl_flags & QFCF_LAZY);
			cl->cl_flags &= ~QFCF_LAZY;
			IFCQ_CONVERT_LOCK(ifq);

			cl->cl_sfb = sfb_alloc(ifp, cl->cl_handle,
			    qlimit(&cl->cl_q), cl->cl_qflags);
			if (cl->cl_sfb == NULL) {
				/* fall back to droptail */
				qtype(&cl->cl_q) = Q_DROPTAIL;
				cl->cl_flags &= ~QFCF_SFB;
				cl->cl_qflags &= ~(SFBF_ECN | SFBF_FLOWCTL);

				log(LOG_ERR, "%s: %s SFB lazy allocation "
				    "failed for qid=%d grp=%d, falling back "
				    "to DROPTAIL\n", if_name(ifp),
				    qfq_style(qif), cl->cl_handle,
				    cl->cl_grp->qfg_index);
			} else if (qif->qif_throttle != IFNET_THROTTLE_OFF) {
				/* if there's pending throttling, set it */
				cqrq_throttle_t tr = { 1, qif->qif_throttle };
				int err = qfq_throttle(qif, &tr);

				if (err == EALREADY)
					err = 0;
				if (err != 0) {
					tr.level = IFNET_THROTTLE_OFF;
					(void) qfq_throttle(qif, &tr);
				}
			}
		}
		if (cl->cl_sfb != NULL)
			return (sfb_addq(cl->cl_sfb, &cl->cl_q, m, t));
	} else if (qlen(&cl->cl_q) >= qlimit(&cl->cl_q)) {
		IFCQ_CONVERT_LOCK(ifq);
		m_freem(m);
		return (CLASSQEQ_DROPPED);
	}

#if PF_ECN
	if (cl->cl_flags & QFCF_CLEARDSCP)
		write_dsfield(m, t, 0);
#endif /* PF_ECN */

	_addq(&cl->cl_q, m);

	return (0);
}

static inline struct mbuf *
qfq_getq(struct qfq_class *cl)
{
	IFCQ_LOCK_ASSERT_HELD(cl->cl_qif->qif_ifq);

#if CLASSQ_RIO
	if (q_is_rio(&cl->cl_q))
		return (rio_getq(cl->cl_rio, &cl->cl_q));
	else
#endif /* CLASSQ_RIO */
#if CLASSQ_RED
	if (q_is_red(&cl->cl_q))
		return (red_getq(cl->cl_red, &cl->cl_q));
	else
#endif /* CLASSQ_RED */
#if CLASSQ_BLUE
	if (q_is_blue(&cl->cl_q))
		return (blue_getq(cl->cl_blue, &cl->cl_q));
	else
#endif /* CLASSQ_BLUE */
	if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL)
		return (sfb_getq(cl->cl_sfb, &cl->cl_q));

	return (_getq(&cl->cl_q));
}

static inline struct mbuf *
qfq_pollq(struct qfq_class *cl)
{
	IFCQ_LOCK_ASSERT_HELD(cl->cl_qif->qif_ifq);

	return (qhead(&cl->cl_q));
}

static void
qfq_purgeq(struct qfq_if *qif, struct qfq_class *cl, u_int32_t flow,
    u_int32_t *packets, u_int32_t *bytes)
{
	struct ifclassq *ifq = qif->qif_ifq;
	u_int32_t cnt = 0, len = 0, qlen;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	if ((qlen = qlen(&cl->cl_q)) == 0)
		goto done;

	/* become regular mutex before freeing mbufs */
	IFCQ_CONVERT_LOCK(ifq);

#if CLASSQ_RIO
	if (q_is_rio(&cl->cl_q))
		rio_purgeq(cl->cl_rio, &cl->cl_q, flow, &cnt, &len);
	else
#endif /* CLASSQ_RIO */
#if CLASSQ_RED
	if (q_is_red(&cl->cl_q))
		red_purgeq(cl->cl_red, &cl->cl_q, flow, &cnt, &len);
	else
#endif /* CLASSQ_RED */
#if CLASSQ_BLUE
	if (q_is_blue(&cl->cl_q))
		blue_purgeq(cl->cl_blue, &cl->cl_q, flow, &cnt, &len);
	else
#endif /* CLASSQ_BLUE */
	if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL)
		sfb_purgeq(cl->cl_sfb, &cl->cl_q, flow, &cnt, &len);
	else
		_flushq_flow(&cl->cl_q, flow, &cnt, &len);

	if (cnt > 0) {
		VERIFY(qlen(&cl->cl_q) == (qlen - cnt));
#if QFQ_DEBUG
		VERIFY(qif->qif_queued >= cnt);
		qif->qif_queued -= cnt;
#endif /* QFQ_DEBUG */

		PKTCNTR_ADD(&cl->cl_dropcnt, cnt, len);
		IFCQ_DROP_ADD(ifq, cnt, len);

		VERIFY(((signed)IFCQ_LEN(ifq) - cnt) >= 0);
		IFCQ_LEN(ifq) -= cnt;

		if (qempty(&cl->cl_q))
			qfq_deactivate_class(qif, cl);

		if (pktsched_verbose) {
			log(LOG_DEBUG, "%s: %s purge qid=%d weight=%d "
			    "qlen=[%d,%d] cnt=%d len=%d flow=0x%x\n",
			    if_name(QFQIF_IFP(qif)),
			    qfq_style(qif), cl->cl_handle,
			    (u_int32_t)(QFQ_ONE_FP / cl->cl_inv_w), qlen,
			    qlen(&cl->cl_q), cnt, len, flow);
		}
	}
done:
	if (packets != NULL)
		*packets = cnt;
	if (bytes != NULL)
		*bytes = len;
}

static void
qfq_updateq(struct qfq_if *qif, struct qfq_class *cl, cqev_t ev)
{
	IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq);

	if (pktsched_verbose) {
		log(LOG_DEBUG, "%s: %s update qid=%d weight=%d event=%s\n",
		    if_name(QFQIF_IFP(qif)), qfq_style(qif),
		    cl->cl_handle, (u_int32_t)(QFQ_ONE_FP / cl->cl_inv_w),
		    ifclassq_ev2str(ev));
	}

#if CLASSQ_RIO
	if (q_is_rio(&cl->cl_q))
		return (rio_updateq(cl->cl_rio, ev));
#endif /* CLASSQ_RIO */
#if CLASSQ_RED
	if (q_is_red(&cl->cl_q))
		return (red_updateq(cl->cl_red, ev));
#endif /* CLASSQ_RED */
#if CLASSQ_BLUE
	if (q_is_blue(&cl->cl_q))
		return (blue_updateq(cl->cl_blue, ev));
#endif /* CLASSQ_BLUE */
	if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL)
		return (sfb_updateq(cl->cl_sfb, ev));
}

int
qfq_get_class_stats(struct qfq_if *qif, u_int32_t qid,
    struct qfq_classstats *sp)
{
	struct qfq_class *cl;

	IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq);

	if ((cl = qfq_clh_to_clp(qif, qid)) == NULL)
		return (EINVAL);

	sp->class_handle = cl->cl_handle;
	sp->index = cl->cl_grp->qfg_index;
	sp->weight = (QFQ_ONE_FP / cl->cl_inv_w);
	sp->lmax = cl->cl_lmax;
	sp->qlength = qlen(&cl->cl_q);
	sp->qlimit = qlimit(&cl->cl_q);
	sp->period = cl->cl_period;
	sp->xmitcnt = cl->cl_xmitcnt;
	sp->dropcnt = cl->cl_dropcnt;

	sp->qtype = qtype(&cl->cl_q);
	sp->qstate = qstate(&cl->cl_q);
#if CLASSQ_RED
	if (q_is_red(&cl->cl_q))
		red_getstats(cl->cl_red, &sp->red[0]);
#endif /* CLASSQ_RED */
#if CLASSQ_RIO
	if (q_is_rio(&cl->cl_q))
		rio_getstats(cl->cl_rio, &sp->red[0]);
#endif /* CLASSQ_RIO */
#if CLASSQ_BLUE
	if (q_is_blue(&cl->cl_q))
		blue_getstats(cl->cl_blue, &sp->blue);
#endif /* CLASSQ_BLUE */
	if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL)
		sfb_getstats(cl->cl_sfb, &sp->sfb);

	return (0);
}

static int
qfq_stat_sc(struct qfq_if *qif, cqrq_stat_sc_t *sr)
{
	struct ifclassq *ifq = qif->qif_ifq;
	struct qfq_class *cl;
	u_int32_t i;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	VERIFY(sr->sc == MBUF_SC_UNSPEC || MBUF_VALID_SC(sr->sc));

	i = MBUF_SCIDX(sr->sc);
	VERIFY(i < IFCQ_SC_MAX);

	cl = ifq->ifcq_disc_slots[i].cl;
	sr->packets = qlen(&cl->cl_q);
	sr->bytes = qsize(&cl->cl_q);

	return (0);
}

/* convert a class handle to the corresponding class pointer */
static inline struct qfq_class *
qfq_clh_to_clp(struct qfq_if *qif, u_int32_t chandle)
{
	struct qfq_class *cl;
	int i;

	IFCQ_LOCK_ASSERT_HELD(qif->qif_ifq);

	/*
	 * First, try optimistically the slot matching the lower bits of
	 * the handle.  If it fails, do the linear table search.
	 */
	i = chandle % qif->qif_maxclasses;
	if ((cl = qif->qif_class_tbl[i]) != NULL && cl->cl_handle == chandle)
		return (cl);
	for (i = 0; i < qif->qif_maxclasses; i++)
		if ((cl = qif->qif_class_tbl[i]) != NULL &&
		    cl->cl_handle == chandle)
			return (cl);

	return (NULL);
}

static const char *
qfq_style(struct qfq_if *qif)
{
	return ((qif->qif_flags & QFQIFF_ALTQ) ? "ALTQ_QFQ" : "QFQ");
}

/*
 * Generic comparison function, handling wraparound
 */
static inline int
qfq_gt(u_int64_t a, u_int64_t b)
{
	return ((int64_t)(a - b) > 0);
}
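
/*
 * Example of the wraparound behaviour: shortly after the virtual clock
 * wraps, a = 1 and b = ~0ULL give a - b = 2 under modular arithmetic,
 * so (int64_t)(a - b) > 0 and qfq_gt() still reports a as the later
 * timestamp, whereas a plain `a > b' comparison would not.
 */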

/*
 * Round a precise timestamp to its slotted value
 */
static inline u_int64_t
qfq_round_down(u_int64_t ts, u_int32_t shift)
{
	return (ts & ~((1ULL << shift) - 1));
}
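
/*
 * Example: with shift = 3 the slot size is 8, so qfq_round_down(29, 3)
 * masks off the low three bits of 29 (0b11101) and returns 24
 * (0b11000), the start of the containing slot.
 */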

/*
 * Return the pointer to the group with lowest index in the bitmap
 */
static inline struct qfq_group *
qfq_ffs(struct qfq_if *qif, pktsched_bitmap_t bitmap)
{
	int index = pktsched_ffs(bitmap) - 1;	/* zero-based */
	VERIFY(index >= 0 && index <= QFQ_MAX_INDEX &&
	    qif->qif_groups[index] != NULL);
	return (qif->qif_groups[index]);
}

/*
 * Calculate a flow index, given its weight and maximum packet length.
 * index = log_2(maxlen/weight) but we need to apply the scaling.
 * This is used only once at flow creation.
 */
static int
qfq_calc_index(struct qfq_class *cl, u_int32_t inv_w, u_int32_t maxlen)
{
	u_int64_t slot_size = (u_int64_t)maxlen * inv_w;
	pktsched_bitmap_t size_map;
	int index = 0;

	size_map = (pktsched_bitmap_t)(slot_size >> QFQ_MIN_SLOT_SHIFT);
	if (!size_map)
		goto out;

	index = __fls(size_map) + 1;	/* basically a log_2() */
	index -= !(slot_size - (1ULL << (index + QFQ_MIN_SLOT_SHIFT - 1)));

	if (index < 0)
		index = 0;
out:
	if (pktsched_verbose) {
		log(LOG_DEBUG, "%s: %s qid=%d grp=%d W=%u, L=%u, I=%d\n",
		    if_name(QFQIF_IFP(cl->cl_qif)), qfq_style(cl->cl_qif),
		    cl->cl_handle, index, (u_int32_t)(QFQ_ONE_FP/inv_w),
		    maxlen, index);
	}
	return (index);
}
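
/*
 * Worked example (illustrative values): a class with weight 4 has
 * inv_w = QFQ_ONE_FP / 4, so with maxlen = 1024 the slot_size is
 * 1024 * QFQ_ONE_FP / 4 = 256 in units of QFQ_ONE_FP.  After the
 * QFQ_MIN_SLOT_SHIFT scaling, __fls() yields roughly
 * log2(maxlen/weight), and the second statement adjusts the index
 * down by one when slot_size is an exact power of two.
 */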

#if QFQ_DEBUG
static void
qfq_dump_groups(struct qfq_if *qif, u_int32_t mask)
{
	int i, j;

	for (i = 0; i < QFQ_MAX_INDEX + 1; i++) {
		struct qfq_group *g = qif->qif_groups[i];

		if (0 == (mask & (1 << i)))
			continue;
		if (g == NULL)
			continue;

		log(LOG_DEBUG, "%s: %s [%2d] full_slots 0x%x\n",
		    if_name(QFQIF_IFP(qif)), qfq_style(qif), i,
		    g->qfg_full_slots);
		log(LOG_DEBUG, "%s: %s             S 0x%20llx F 0x%llx %c\n",
		    if_name(QFQIF_IFP(qif)), qfq_style(qif),
		    g->qfg_S, g->qfg_F, mask & (1 << i) ? '1' : '0');

		for (j = 0; j < qif->qif_maxslots; j++) {
			if (g->qfg_slots[j]) {
				log(LOG_DEBUG, "%s: %s      bucket %d 0x%llx "
				    "qid %d\n", if_name(QFQIF_IFP(qif)),
				    qfq_style(qif), j,
				    (uint64_t)VM_KERNEL_ADDRPERM(
				    g->qfg_slots[j]),
				    g->qfg_slots[j]->cl_handle);
			}
		}
	}
}

static void
qfq_dump_sched(struct qfq_if *qif, const char *msg)
{
	log(LOG_DEBUG, "%s: %s --- in %s: ---\n",
	    if_name(QFQIF_IFP(qif)), qfq_style(qif), msg);
	log(LOG_DEBUG, "%s: %s emptygrp %d queued %d V 0x%llx\n",
	    if_name(QFQIF_IFP(qif)), qfq_style(qif), qif->qif_emptygrp,
	    qif->qif_queued, qif->qif_V);
	log(LOG_DEBUG, "%s: %s      ER 0x%08x\n",
	    if_name(QFQIF_IFP(qif)), qfq_style(qif), qif->qif_bitmaps[ER]);
	log(LOG_DEBUG, "%s: %s      EB 0x%08x\n",
	    if_name(QFQIF_IFP(qif)), qfq_style(qif), qif->qif_bitmaps[EB]);
	log(LOG_DEBUG, "%s: %s      IR 0x%08x\n",
	    if_name(QFQIF_IFP(qif)), qfq_style(qif), qif->qif_bitmaps[IR]);
	log(LOG_DEBUG, "%s: %s      IB 0x%08x\n",
	    if_name(QFQIF_IFP(qif)), qfq_style(qif), qif->qif_bitmaps[IB]);
	qfq_dump_groups(qif, 0xffffffff);
}
#endif /* QFQ_DEBUG */

/*
 * qfq_enqueue_ifclassq is an enqueue function to be registered to
 * (*ifcq_enqueue) in struct ifclassq.
 */
static int
qfq_enqueue_ifclassq(struct ifclassq *ifq, struct mbuf *m)
{
	u_int32_t i;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	if (!(m->m_flags & M_PKTHDR)) {
		/* should not happen */
		log(LOG_ERR, "%s: packet does not have pkthdr\n",
		    if_name(ifq->ifcq_ifp));
		IFCQ_CONVERT_LOCK(ifq);
		m_freem(m);
		return (ENOBUFS);
	}

	i = MBUF_SCIDX(mbuf_get_service_class(m));
	VERIFY((u_int32_t)i < IFCQ_SC_MAX);

	return (qfq_enqueue(ifq->ifcq_disc,
	    ifq->ifcq_disc_slots[i].cl, m, m_pftag(m)));
}

/*
 * qfq_dequeue_ifclassq is a dequeue function to be registered to
 * (*ifcq_dequeue) in struct ifclassq.
 *
 * note: CLASSQDQ_POLL returns the next packet without removing the packet
 *	from the queue.  CLASSQDQ_REMOVE is a normal dequeue operation.
 *	CLASSQDQ_REMOVE must return the same packet if called immediately
 *	after CLASSQDQ_POLL.
 */
static struct mbuf *
qfq_dequeue_ifclassq(struct ifclassq *ifq, cqdq_op_t op)
{
	return (qfq_dequeue(ifq->ifcq_disc, op));
}

static int
qfq_request_ifclassq(struct ifclassq *ifq, cqrq_t req, void *arg)
{
	struct qfq_if *qif = (struct qfq_if *)ifq->ifcq_disc;
	int err = 0;

	IFCQ_LOCK_ASSERT_HELD(ifq);

	switch (req) {
	case CLASSQRQ_PURGE:
		qfq_purge(qif);
		break;

	case CLASSQRQ_PURGE_SC:
		qfq_purge_sc(qif, (cqrq_purge_sc_t *)arg);
		break;

	case CLASSQRQ_EVENT:
		qfq_event(qif, (cqev_t)arg);
		break;

	case CLASSQRQ_THROTTLE:
		err = qfq_throttle(qif, (cqrq_throttle_t *)arg);
		break;

	case CLASSQRQ_STAT_SC:
		err = qfq_stat_sc(qif, (cqrq_stat_sc_t *)arg);
		break;
	}
	return (err);
}

int
qfq_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags)
{
	struct ifnet *ifp = ifq->ifcq_ifp;
	struct qfq_class *cl0, *cl1, *cl2, *cl3, *cl4;
	struct qfq_class *cl5, *cl6, *cl7, *cl8, *cl9;
	struct qfq_if *qif;
	u_int32_t maxlen = 0, qflags = 0;
	int err = 0;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(ifq->ifcq_disc == NULL);
	VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);

	if (flags & PKTSCHEDF_QALG_RED)
		qflags |= QFCF_RED;
	if (flags & PKTSCHEDF_QALG_RIO)
		qflags |= QFCF_RIO;
	if (flags & PKTSCHEDF_QALG_BLUE)
		qflags |= QFCF_BLUE;
	if (flags & PKTSCHEDF_QALG_SFB)
		qflags |= QFCF_SFB;
	if (flags & PKTSCHEDF_QALG_ECN)
		qflags |= QFCF_ECN;
	if (flags & PKTSCHEDF_QALG_FLOWCTL)
		qflags |= QFCF_FLOWCTL;
	if (flags & PKTSCHEDF_QALG_DELAYBASED)
		qflags |= QFCF_DELAYBASED;

	qif = qfq_alloc(ifp, M_WAITOK, FALSE);
	if (qif == NULL)
		return (ENOMEM);

	if ((maxlen = IFCQ_MAXLEN(ifq)) == 0)
		maxlen = if_sndq_maxlen;

	if ((err = qfq_add_queue(qif, maxlen, 300, 1200,
	    qflags | QFCF_LAZY, SCIDX_BK_SYS, &cl0)) != 0)
		goto cleanup;

	if ((err = qfq_add_queue(qif, maxlen, 600, 1400,
	    qflags | QFCF_LAZY, SCIDX_BK, &cl1)) != 0)
		goto cleanup;

	if ((err = qfq_add_queue(qif, maxlen, 2400, 600,
	    qflags | QFCF_DEFAULTCLASS, SCIDX_BE, &cl2)) != 0)
		goto cleanup;

	if ((err = qfq_add_queue(qif, maxlen, 2700, 600,
	    qflags | QFCF_LAZY, SCIDX_RD, &cl3)) != 0)
		goto cleanup;

	if ((err = qfq_add_queue(qif, maxlen, 3000, 400,
	    qflags | QFCF_LAZY, SCIDX_OAM, &cl4)) != 0)
		goto cleanup;

	if ((err = qfq_add_queue(qif, maxlen, 8000, 1000,
	    qflags | QFCF_LAZY, SCIDX_AV, &cl5)) != 0)
		goto cleanup;

	if ((err = qfq_add_queue(qif, maxlen, 15000, 1200,
	    qflags | QFCF_LAZY, SCIDX_RV, &cl6)) != 0)
		goto cleanup;

	if ((err = qfq_add_queue(qif, maxlen, 20000, 1400,
	    qflags | QFCF_LAZY, SCIDX_VI, &cl7)) != 0)
		goto cleanup;

	if ((err = qfq_add_queue(qif, maxlen, 23000, 200,
	    qflags | QFCF_LAZY, SCIDX_VO, &cl8)) != 0)
		goto cleanup;

	if ((err = qfq_add_queue(qif, maxlen, 25000, 200,
	    qflags, SCIDX_CTL, &cl9)) != 0)
		goto cleanup;

	err = ifclassq_attach(ifq, PKTSCHEDT_QFQ, qif,
	    qfq_enqueue_ifclassq, qfq_dequeue_ifclassq, NULL,
	    qfq_request_ifclassq);

	/* cache these for faster lookup */
	if (err == 0) {
		ifq->ifcq_disc_slots[SCIDX_BK_SYS].qid = SCIDX_BK_SYS;
		ifq->ifcq_disc_slots[SCIDX_BK_SYS].cl = cl0;

		ifq->ifcq_disc_slots[SCIDX_BK].qid = SCIDX_BK;
		ifq->ifcq_disc_slots[SCIDX_BK].cl = cl1;

		ifq->ifcq_disc_slots[SCIDX_BE].qid = SCIDX_BE;
		ifq->ifcq_disc_slots[SCIDX_BE].cl = cl2;

		ifq->ifcq_disc_slots[SCIDX_RD].qid = SCIDX_RD;
		ifq->ifcq_disc_slots[SCIDX_RD].cl = cl3;

		ifq->ifcq_disc_slots[SCIDX_OAM].qid = SCIDX_OAM;
		ifq->ifcq_disc_slots[SCIDX_OAM].cl = cl4;

		ifq->ifcq_disc_slots[SCIDX_AV].qid = SCIDX_AV;
		ifq->ifcq_disc_slots[SCIDX_AV].cl = cl5;

		ifq->ifcq_disc_slots[SCIDX_RV].qid = SCIDX_RV;
		ifq->ifcq_disc_slots[SCIDX_RV].cl = cl6;

		ifq->ifcq_disc_slots[SCIDX_VI].qid = SCIDX_VI;
		ifq->ifcq_disc_slots[SCIDX_VI].cl = cl7;

		ifq->ifcq_disc_slots[SCIDX_VO].qid = SCIDX_VO;
		ifq->ifcq_disc_slots[SCIDX_VO].cl = cl8;

		ifq->ifcq_disc_slots[SCIDX_CTL].qid = SCIDX_CTL;
		ifq->ifcq_disc_slots[SCIDX_CTL].cl = cl9;
	}

cleanup:
	if (err != 0)
		(void) qfq_destroy_locked(qif);

	return (err);
}

int
qfq_teardown_ifclassq(struct ifclassq *ifq)
{
	struct qfq_if *qif = ifq->ifcq_disc;
	int i;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(qif != NULL && ifq->ifcq_type == PKTSCHEDT_QFQ);

	(void) qfq_destroy_locked(qif);

	ifq->ifcq_disc = NULL;
	for (i = 0; i < IFCQ_SC_MAX; i++) {
		ifq->ifcq_disc_slots[i].qid = 0;
		ifq->ifcq_disc_slots[i].cl = NULL;
	}

	return (ifclassq_detach(ifq));
}

int
qfq_getqstats_ifclassq(struct ifclassq *ifq, u_int32_t slot,
    struct if_ifclassq_stats *ifqs)
{
	struct qfq_if *qif = ifq->ifcq_disc;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(ifq->ifcq_type == PKTSCHEDT_QFQ);

	if (slot >= IFCQ_SC_MAX)
		return (EINVAL);

	return (qfq_get_class_stats(qif, ifq->ifcq_disc_slots[slot].qid,
	    &ifqs->ifqs_qfq_stats));
}

static int
qfq_throttle(struct qfq_if *qif, cqrq_throttle_t *tr)
{
	struct ifclassq *ifq = qif->qif_ifq;
	struct qfq_class *cl;
	int err = 0;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(!(qif->qif_flags & QFQIFF_ALTQ));

	if (!tr->set) {
		tr->level = qif->qif_throttle;
		return (0);
	}

	if (tr->level == qif->qif_throttle)
		return (EALREADY);

	/* Current throttling levels only involve BK_SYS class */
	cl = ifq->ifcq_disc_slots[SCIDX_BK_SYS].cl;

	switch (tr->level) {
	case IFNET_THROTTLE_OFF:
		err = qfq_resumeq(qif, cl);
		break;

	case IFNET_THROTTLE_OPPORTUNISTIC:
		err = qfq_suspendq(qif, cl);
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	if (err == 0 || err == ENXIO) {
		if (pktsched_verbose) {
			log(LOG_DEBUG, "%s: %s throttling level %sset %d->%d\n",
			    if_name(QFQIF_IFP(qif)), qfq_style(qif),
			    (err == 0) ? "" : "lazy ", qif->qif_throttle,
			    tr->level);
		}
		qif->qif_throttle = tr->level;
		if (err != 0)
			err = 0;
		else
			qfq_purgeq(qif, cl, 0, NULL, NULL);
	} else {
		log(LOG_ERR, "%s: %s unable to set throttling level "
		    "%d->%d [error=%d]\n", if_name(QFQIF_IFP(qif)),
		    qfq_style(qif), qif->qif_throttle, tr->level, err);
	}

	return (err);
}

static int
qfq_resumeq(struct qfq_if *qif, struct qfq_class *cl)
{
	struct ifclassq *ifq = qif->qif_ifq;
	int err = 0;

	IFCQ_LOCK_ASSERT_HELD(ifq);

#if CLASSQ_RIO
	if (q_is_rio(&cl->cl_q))
		err = rio_suspendq(cl->cl_rio, &cl->cl_q, FALSE);
	else
#endif /* CLASSQ_RIO */
#if CLASSQ_RED
	if (q_is_red(&cl->cl_q))
		err = red_suspendq(cl->cl_red, &cl->cl_q, FALSE);
	else
#endif /* CLASSQ_RED */
#if CLASSQ_BLUE
	if (q_is_blue(&cl->cl_q))
		err = blue_suspendq(cl->cl_blue, &cl->cl_q, FALSE);
	else
#endif /* CLASSQ_BLUE */
	if (q_is_sfb(&cl->cl_q) && cl->cl_sfb != NULL)
		err = sfb_suspendq(cl->cl_sfb, &cl->cl_q, FALSE);

	if (err == 0)
		qstate(&cl->cl_q) = QS_RUNNING;

	return (err);
}

static int
qfq_suspendq(struct qfq_if *qif, struct qfq_class *cl)
{
	struct ifclassq *ifq = qif->qif_ifq;
	int err = 0;

	IFCQ_LOCK_ASSERT_HELD(ifq);

#if CLASSQ_RIO
	if (q_is_rio(&cl->cl_q))
		err = rio_suspendq(cl->cl_rio, &cl->cl_q, TRUE);
	else
#endif /* CLASSQ_RIO */
#if CLASSQ_RED
	if (q_is_red(&cl->cl_q))
		err = red_suspendq(cl->cl_red, &cl->cl_q, TRUE);
	else
#endif /* CLASSQ_RED */
#if CLASSQ_BLUE
	if (q_is_blue(&cl->cl_q))
		err = blue_suspendq(cl->cl_blue, &cl->cl_q, TRUE);
	else
#endif /* CLASSQ_BLUE */
	if (q_is_sfb(&cl->cl_q)) {
		if (cl->cl_sfb != NULL) {
			err = sfb_suspendq(cl->cl_sfb, &cl->cl_q, TRUE);
		} else {
			VERIFY(cl->cl_flags & QFCF_LAZY);
			err = ENXIO;	/* delayed throttling */
		}
	}

	if (err == 0 || err == ENXIO)
		qstate(&cl->cl_q) = QS_SUSPENDED;

	return (err);
}