/*
 * Copyright (c) 2007-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*	$OpenBSD: altq_rmclass.c,v 1.13 2007/09/13 20:40:02 chl Exp $	*/
/*	$KAME: altq_rmclass.c,v 1.10 2001/02/09 07:20:40 kjc Exp $	*/

/*
 * Copyright (c) 1991-1997 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the Network Research
 *      Group at Lawrence Berkeley Laboratory.
 * 4. Neither the name of the University nor of the Laboratory may be used
 *    to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * LBL code modified by speer@eng.sun.com, May 1997.
 * For questions and/or comments, please send mail to cbq@ee.lbl.gov
 */

#include <sys/cdefs.h>

#ident "@(#)rm_class.c  1.48     97/12/05 SMI"

#if PKTSCHED_CBQ

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/kernel_types.h>
#include <sys/syslog.h>

#include <kern/zalloc.h>

#include <net/if.h>
#include <net/net_osdep.h>
#include <net/pktsched/pktsched.h>
#include <net/pktsched/pktsched_rmclass.h>
#include <net/pktsched/pktsched_rmclass_debug.h>
#include <net/classq/classq_red.h>
#include <net/classq/classq_rio.h>
#include <net/classq/classq_blue.h>
#include <net/classq/classq_sfb.h>

/*
 * Local Macros
 */

#define	reset_cutoff(ifd)	{ ifd->cutoff_ = RM_MAXDEPTH; }

/*
 * Local routines.
 */

static int	rmc_satisfied(struct rm_class *, struct timeval *);
static void	rmc_wrr_set_weights(struct rm_ifdat *);
static void	rmc_depth_compute(struct rm_class *);
static void	rmc_depth_recompute(rm_class_t *);

static struct mbuf *_rmc_wrr_dequeue_next(struct rm_ifdat *, cqdq_op_t);
static struct mbuf *_rmc_prr_dequeue_next(struct rm_ifdat *, cqdq_op_t);

static int	_rmc_addq(rm_class_t *, struct mbuf *, struct pf_mtag *);
static void	_rmc_dropq(rm_class_t *);
static struct mbuf *_rmc_getq(rm_class_t *);
static struct mbuf *_rmc_pollq(rm_class_t *);

static int	rmc_under_limit(struct rm_class *, struct timeval *);
static void	rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
static void	rmc_drop_action(struct rm_class *);
static void	rmc_restart(struct rm_class *);
static void	rmc_root_overlimit(rm_class_t *, rm_class_t *);

#define	RMC_ZONE_MAX	32		/* maximum elements in zone */
#define	RMC_ZONE_NAME	"pktsched_cbq_cl" /* zone name (CBQ for now) */

static unsigned int rmc_size;		/* size of zone element */
static struct zone *rmc_zone;		/* zone for rm_class */

void
rmclass_init(void)
{
	if (rmc_zone != NULL)
		return;

	rmc_size = sizeof (struct rm_class);
	rmc_zone = zinit(rmc_size, RMC_ZONE_MAX * rmc_size, 0, RMC_ZONE_NAME);
	if (rmc_zone == NULL) {
		panic("%s: failed allocating %s", __func__, RMC_ZONE_NAME);
		/* NOTREACHED */
	}
	zone_change(rmc_zone, Z_EXPAND, TRUE);
	zone_change(rmc_zone, Z_CALLERACCT, TRUE);
}

#define	BORROW_OFFTIME
/*
 * BORROW_OFFTIME (experimental):
 * borrow the offtime of the class being borrowed from.
 * the reason is that when its own offtime is set, the class is unable
 * to borrow much, especially when cutoff is taking effect.
 * but when the borrowed class is overloaded (avgidle is close to minidle),
 * use the borrowing class's offtime to avoid overload.
 */
#define	ADJUST_CUTOFF
/*
 * ADJUST_CUTOFF (experimental):
 * if no underlimit class is found due to cutoff, increase cutoff and
 * retry the scheduling loop.
 * also, don't invoke delay_actions while cutoff is taking effect,
 * since a sleeping class won't have a chance to be scheduled in the
 * next loop.
 *
 * the heuristics for setting the top-level variable (cutoff_) become:
 *	1. if a packet arrives for a not-overlimit class, set cutoff
 *	   to the depth of the class.
 *	2. if cutoff is i, and a packet arrives for an overlimit class
 *	   with an underlimit ancestor at a lower level than i (say j),
 *	   then set cutoff to j.
 *	3. at scheduling a packet, if there is no underlimit class
 *	   due to the current cutoff level, increase cutoff by 1 and
 *	   then try to schedule again.
 */
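
/*
 * Illustrative example (not from the original sources): consider a
 * two-level tree with root R at depth 2, agency class A at depth 1,
 * and leaf L at depth 0 under A.  Starting from reset_cutoff()
 * (cutoff_ == RM_MAXDEPTH): if a packet arrives for L while L is
 * overlimit but A is underlimit, rule 2 lowers cutoff_ to 1, so L may
 * borrow from A but not from R.  If a packet later arrives for an
 * underlimit leaf, rule 1 lowers cutoff_ to 0, shutting off borrowing
 * entirely.  When the scheduler then finds nothing underlimit at the
 * current cutoff, rule 3 raises cutoff_ one level at a time until some
 * class becomes eligible to send.
 */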

/*
 * rm_class_t *
 * rmc_newclass(...) - Create a new resource management class at priority
 * 'pri' on the interface given by 'ifd'.
 *
 * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
 *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
 *              than 100% of the bandwidth, this number should be the
 *              'effective' rate for the class.  Let f be the
 *              bandwidth fraction allocated to this class, and let
 *              nsPerByte be the data rate of the output link in
 *              nanoseconds/byte.  Then nsecPerByte is set to
 *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
 *              for a class that gets 50% of an ethernet's bandwidth.
 *
 * action       the routine to call when the class is over limit.
 *
 * maxq         max allowable queue size for class (in packets).
 *
 * parent       parent class pointer.
 *
 * borrow       class to borrow from (should be either 'parent' or null).
 *
 * maxidle      max value allowed for class 'idle' time estimate (this
 *              parameter determines how large an initial burst of packets
 *              can be before overlimit action is invoked).
 *
 * offtime      how long 'delay' action will delay when class goes over
 *              limit (this parameter determines the steady-state burst
 *              size when a class is running over its limit).
 *
 * Maxidle and offtime have to be computed from the following:  If the
 * average packet size is s, the bandwidth fraction allocated to this
 * class is f, we want to allow b packet bursts, and the gain of the
 * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
 *
 *   ptime = s * nsPerByte * (1 - f) / f
 *   maxidle = ptime * (1 - g^b) / g^b
 *   minidle = -ptime * (1 / (f - 1))
 *   offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1))
 *
 * Operationally, it's convenient to specify maxidle & offtime in units
 * independent of the link bandwidth so the maxidle & offtime passed to
 * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
 * (The constant factor is a scale factor needed to make the parameters
 * integers.  This scaling also means that the 'unscaled' values of
 * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
 * not nanoseconds.)  Also note that the 'idle' filter computation keeps
 * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
 * maxidle also must be scaled upward by this value.  Thus, the passed
 * values for maxidle and offtime can be computed as follows:
 *
 * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
 * offtime = offtime * 8 / (1000 * nsecPerByte)
 *
 * When USE_HRTIME is employed, then maxidle and offtime become:
 * 	maxidle = maxidle * (8.0 / nsecPerByte);
 * 	offtime = offtime * (8.0 / nsecPerByte);
 */
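/*
 * Worked example (illustrative figures only, assuming
 * RM_FILTER_GAIN == 5, i.e. g = 1 - 2^-5 = 0.96875): for a class
 * granted half of a 10Mb/s ethernet (nsPerByte = 800, f = 0.5, so
 * nsecPerByte = 1600), with average packet size s = 1000 bytes and a
 * burst allowance of b = 16 packets:
 *
 *   ptime   = 1000 * 800 * (1 - 0.5) / 0.5 = 800000 ns (800 us)
 *   g^b     = 0.96875^16 ~= 0.602
 *   maxidle = 800 us * (1 - 0.602) / 0.602 ~= 530 us
 *
 * so the scaled value passed in would be roughly
 * 530000 * 2^5 * 8 / (1000 * 1600) ~= 85.  The constructor below
 * inverts that scaling: (85 * 1600) / 8 == 17000, which is the ~530 us
 * estimate kept scaled up by 2^RM_FILTER_GAIN (17000 / 32 ~= 531).
 */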
struct rm_class *
rmc_newclass(int pri, struct rm_ifdat *ifd, u_int32_t nsecPerByte,
    void (*action)(rm_class_t *, rm_class_t *), u_int32_t qid, u_int32_t maxq,
    struct rm_class *parent, struct rm_class *borrow, u_int32_t maxidle,
    int minidle, u_int32_t offtime, int pktsize, int flags)
{
	struct ifnet *ifp;
	struct ifclassq *ifq;
	struct rm_class	*cl;
	struct rm_class	*peer;

	if (nsecPerByte == 0) {
		log(LOG_ERR, "%s: invalid inverse data rate\n", __func__);
		return (NULL);
	}

	if (pri >= RM_MAXPRIO) {
		log(LOG_ERR, "%s: priority %d out of range! (max %d)\n",
		    __func__, pri, RM_MAXPRIO - 1);
		return (NULL);
	}

#if !CLASSQ_RED
	if (flags & RMCF_RED) {
		log(LOG_ERR, "%s: RED not configured for CBQ!\n", __func__);
		return (NULL);
	}
#endif /* !CLASSQ_RED */

#if !CLASSQ_RIO
	if (flags & RMCF_RIO) {
		log(LOG_ERR, "%s: RIO not configured for CBQ!\n", __func__);
		return (NULL);
	}
#endif /* !CLASSQ_RIO */

#if !CLASSQ_BLUE
	if (flags & RMCF_BLUE) {
		log(LOG_ERR, "%s: BLUE not configured for CBQ!\n", __func__);
		return (NULL);
	}
#endif /* !CLASSQ_BLUE */

	/* These are mutually exclusive */
	if ((flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) &&
	    (flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) != RMCF_RED &&
	    (flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) != RMCF_RIO &&
	    (flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) != RMCF_BLUE &&
	    (flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) != RMCF_SFB) {
		log(LOG_ERR, "%s: RED|RIO|BLUE|SFB mutually exclusive\n",
		    __func__);
		return (NULL);
	}

	cl = zalloc(rmc_zone);
	if (cl == NULL)
		return (NULL);

	bzero(cl, rmc_size);
	CALLOUT_INIT(&cl->callout_);

	/*
	 * Class initialization.
	 */
	cl->children_ = NULL;
	cl->parent_ = parent;
	cl->borrow_ = borrow;
	cl->leaf_ = 1;
	cl->ifdat_ = ifd;
	cl->pri_ = pri;
	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
	cl->depth_ = 0;
	cl->qthresh_ = 0;
	cl->ns_per_byte_ = nsecPerByte;

	ifq = ifd->ifq_;
	ifp = ifq->ifcq_ifp;

	if (maxq == 0 || maxq > IFCQ_MAXLEN(ifq)) {
		maxq = IFCQ_MAXLEN(ifq);
		if (maxq == 0)
			maxq = DEFAULT_QLIMIT;	/* use default */
	}
	_qinit(&cl->q_, Q_DROPHEAD, maxq);

	cl->flags_ = flags;

	cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
	if (cl->minidle_ > 0)
		cl->minidle_ = 0;

	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
	if (cl->maxidle_ == 0)
		cl->maxidle_ = 1;

	cl->avgidle_ = cl->maxidle_;
	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
	if (cl->offtime_ == 0)
		cl->offtime_ = 1;

	cl->overlimit = action;

	if (flags & (RMCF_RED|RMCF_RIO|RMCF_BLUE|RMCF_SFB)) {
		int pkttime;

		cl->qflags_ = 0;
		if (flags & RMCF_ECN) {
			if (flags & RMCF_BLUE)
				cl->qflags_ |= BLUEF_ECN;
			else if (flags & RMCF_SFB)
				cl->qflags_ |= SFBF_ECN;
			else if (flags & RMCF_RED)
				cl->qflags_ |= REDF_ECN;
			else if (flags & RMCF_RIO)
				cl->qflags_ |= RIOF_ECN;
		}
		if (flags & RMCF_FLOWCTL) {
			if (flags & RMCF_SFB)
				cl->qflags_ |= SFBF_FLOWCTL;
		}
		if (flags & RMCF_FLOWVALVE) {
			if (flags & RMCF_RED)
				cl->qflags_ |= REDF_FLOWVALVE;
		}
		if (flags & RMCF_CLEARDSCP) {
			if (flags & RMCF_RIO)
				cl->qflags_ |= RIOF_CLEARDSCP;
		}
		pkttime = nsecPerByte * pktsize / 1000;

		/* Test for exclusivity {RED,RIO,BLUE,SFB} was done above */
#if CLASSQ_RED
		if (flags & RMCF_RED) {
			cl->red_ = red_alloc(ifp, 0, 0,
			    qlimit(&cl->q_) * 10/100,
			    qlimit(&cl->q_) * 30/100,
			    cl->qflags_, pkttime);
			if (cl->red_ != NULL)
				qtype(&cl->q_) = Q_RED;
		}
#endif /* CLASSQ_RED */
#if CLASSQ_RIO
		if (flags & RMCF_RIO) {
			cl->rio_ =
			    rio_alloc(ifp, 0, NULL, cl->qflags_, pkttime);
			if (cl->rio_ != NULL)
				qtype(&cl->q_) = Q_RIO;
		}
#endif /* CLASSQ_RIO */
#if CLASSQ_BLUE
		if (flags & RMCF_BLUE) {
			cl->blue_ = blue_alloc(ifp, 0, 0, cl->qflags_);
			if (cl->blue_ != NULL)
				qtype(&cl->q_) = Q_BLUE;
		}
#endif /* CLASSQ_BLUE */
		if (flags & RMCF_SFB) {
			if (!(cl->flags_ & RMCF_LAZY))
				cl->sfb_ = sfb_alloc(ifp, qid,
				    qlimit(&cl->q_), cl->qflags_);
			if (cl->sfb_ != NULL || (cl->flags_ & RMCF_LAZY))
				qtype(&cl->q_) = Q_SFB;
		}
	}

	/*
	 * put the class into the class tree
	 */
	if ((peer = ifd->active_[pri]) != NULL) {
		/* find the last class at this pri */
		cl->peer_ = peer;
		while (peer->peer_ != ifd->active_[pri])
			peer = peer->peer_;
		peer->peer_ = cl;
	} else {
		ifd->active_[pri] = cl;
		cl->peer_ = cl;
	}

	if (cl->parent_) {
		cl->next_ = parent->children_;
		parent->children_ = cl;
		parent->leaf_ = 0;
	}

	/*
	 * Compute the depth of this class and its ancestors in the class
	 * hierarchy.
	 */
	rmc_depth_compute(cl);

	/*
	 * If CBQ's WRR is enabled, then initialize the class WRR state.
	 */
	if (ifd->wrr_) {
		ifd->num_[pri]++;
		ifd->alloc_[pri] += cl->allotment_;
		rmc_wrr_set_weights(ifd);
	}
	return (cl);
}

int
rmc_modclass(struct rm_class *cl, u_int32_t nsecPerByte, int maxq,
    u_int32_t maxidle, int minidle, u_int32_t offtime, int pktsize)
{
#pragma unused(pktsize)
	struct rm_ifdat	*ifd;
	u_int32_t	 old_allotment;

	ifd = cl->ifdat_;
	old_allotment = cl->allotment_;

	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
	cl->qthresh_ = 0;
	cl->ns_per_byte_ = nsecPerByte;

	qlimit(&cl->q_) = maxq;

	cl->minidle_ = (minidle * nsecPerByte) / 8;
	if (cl->minidle_ > 0)
		cl->minidle_ = 0;

	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
	if (cl->maxidle_ == 0)
		cl->maxidle_ = 1;

	cl->avgidle_ = cl->maxidle_;
	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
	if (cl->offtime_ == 0)
		cl->offtime_ = 1;

	/*
	 * If CBQ's WRR is enabled, then recompute the class WRR state.
	 */
	if (ifd->wrr_) {
		ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
		rmc_wrr_set_weights(ifd);
	}
	return (0);
}

/*
 * static void
 * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
 *	the appropriate round-robin weights for the CBQ weighted
 *	round-robin algorithm.
 *
 *	Returns: NONE
 */

static void
rmc_wrr_set_weights(struct rm_ifdat *ifd)
{
	int		i;
	struct rm_class	*cl, *clh;

	for (i = 0; i < RM_MAXPRIO; i++) {
		/*
		 * This is inverted from that of the simulator to
		 * maintain precision.
		 */
		if (ifd->num_[i] == 0) {
			ifd->M_[i] = 0;
		} else {
			ifd->M_[i] =
			    ifd->alloc_[i] / (ifd->num_[i] * ifd->maxpkt_);
		}
		/*
		 * Compute the weighted allotment for each class.
		 * This takes the expensive div instruction out
		 * of the main loop for the wrr scheduling path.
		 * These only get recomputed when a class comes or
		 * goes.
		 */
		if (ifd->active_[i] != NULL) {
			clh = cl = ifd->active_[i];
			do {
				/* safe-guard for slow link or alloc_ == 0 */
				if (ifd->M_[i] == 0) {
					cl->w_allotment_ = 0;
				} else {
					cl->w_allotment_ =
					    cl->allotment_ / ifd->M_[i];
				}
				cl = cl->peer_;
			} while ((cl != NULL) && (cl != clh));
		}
	}
}
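
/*
 * Worked example (illustrative figures only): suppose priority level i
 * holds two classes with allotments of 750000 and 250000 bytes/sec, so
 * alloc_[i] == 1000000, num_[i] == 2, and maxpkt_ == 1500.  Then
 * M_[i] = 1000000 / (2 * 1500) = 333, giving w_allotment_ values of
 * 750000 / 333 = 2252 and 250000 / 333 = 750 bytes per round -- a 3:1
 * service ratio matching the 3:1 bandwidth split.  The w_allotment_
 * total for the level is num_[i] * maxpkt_ bytes, i.e. on average one
 * max-size packet per class per round.
 */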

int
rmc_get_weight(struct rm_ifdat *ifd, int pri)
{
	if ((pri >= 0) && (pri < RM_MAXPRIO))
		return (ifd->M_[pri]);
	else
		return (0);
}

/*
 * static void
 * rmc_depth_compute(struct rm_class *cl) - This function computes the
 *	appropriate depth of class 'cl' and its ancestors.
 *
 *	Returns:	NONE
 */

static void
rmc_depth_compute(struct rm_class *cl)
{
	rm_class_t	*t = cl, *p;

	/*
	 * Recompute the depth for the branch of the tree.
	 */
	while (t != NULL) {
		p = t->parent_;
		if (p && (t->depth_ >= p->depth_)) {
			p->depth_ = t->depth_ + 1;
			t = p;
		} else
			t = NULL;
	}
}
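
/*
 * Example: a newly created class starts as a leaf with depth_ 0.
 * Linking it under a parent that was itself a leaf bumps the parent to
 * depth_ 1, and the increase keeps propagating upward as long as the
 * child's depth is >= the parent's; the walk stops at the first
 * ancestor that is already deeper than the new branch.
 */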

/*
 * static void
 * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
 *	the depth of the tree after a class has been deleted.
 *
 *	Returns:	NONE
 */

static void
rmc_depth_recompute(rm_class_t *cl)
{
	rm_class_t	*p, *t;

	p = cl;
	while (p != NULL) {
		if ((t = p->children_) == NULL) {
			p->depth_ = 0;
		} else {
			int cdepth = 0;

			while (t != NULL) {
				if (t->depth_ > cdepth)
					cdepth = t->depth_;
				t = t->next_;
			}

			if (p->depth_ == cdepth + 1)
				/* no change to this parent */
				return;

			p->depth_ = cdepth + 1;
		}

		p = p->parent_;
	}
}

/*
 * void
 * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
 *	function deletes a class from the link-sharing structure and frees
 *	all resources associated with the class.
 *
 *	Returns: NONE
 */

void
rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
{
	struct rm_class	*p, *head, *previous;

	VERIFY(cl->children_ == NULL);

	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);

	/*
	 * Free packets in the packet queue.
	 * XXX - this may not be a desired behavior.  Packets should be
	 *		re-queued.
	 */
	rmc_dropall(cl);

	/*
	 * If the class has a parent, then remove the class from the
	 * parent's children chain.
	 */
	if (cl->parent_ != NULL) {
		head = cl->parent_->children_;
		p = previous = head;
		if (head->next_ == NULL) {
			VERIFY(head == cl);
			cl->parent_->children_ = NULL;
			cl->parent_->leaf_ = 1;
		} else while (p != NULL) {
			if (p == cl) {
				if (cl == head)
					cl->parent_->children_ = cl->next_;
				else
					previous->next_ = cl->next_;
				cl->next_ = NULL;
				p = NULL;
			} else {
				previous = p;
				p = p->next_;
			}
		}
	}

	/*
	 * Delete class from class priority peer list.
	 */
	if ((p = ifd->active_[cl->pri_]) != NULL) {
		/*
		 * If there is more than one member of this priority
		 * level, then look for class(cl) in the priority level.
		 */
		if (p != p->peer_) {
			while (p->peer_ != cl)
				p = p->peer_;
			p->peer_ = cl->peer_;

			if (ifd->active_[cl->pri_] == cl)
				ifd->active_[cl->pri_] = cl->peer_;
		} else {
			VERIFY(p == cl);
			ifd->active_[cl->pri_] = NULL;
		}
	}

	/*
	 * Recompute the WRR weights.
	 */
	if (ifd->wrr_) {
		ifd->alloc_[cl->pri_] -= cl->allotment_;
		ifd->num_[cl->pri_]--;
		rmc_wrr_set_weights(ifd);
	}

	/*
	 * Re-compute the depth of the tree.
	 */
	rmc_depth_recompute(cl->parent_);

	/*
	 * Free the class structure.
	 */
	if (cl->qalg_.ptr != NULL) {
#if CLASSQ_RIO
		if (q_is_rio(&cl->q_))
			rio_destroy(cl->rio_);
#endif /* CLASSQ_RIO */
#if CLASSQ_RED
		if (q_is_red(&cl->q_))
			red_destroy(cl->red_);
#endif /* CLASSQ_RED */
#if CLASSQ_BLUE
		if (q_is_blue(&cl->q_))
			blue_destroy(cl->blue_);
#endif /* CLASSQ_BLUE */
		if (q_is_sfb(&cl->q_) && cl->sfb_ != NULL)
			sfb_destroy(cl->sfb_);
		cl->qalg_.ptr = NULL;
		qtype(&cl->q_) = Q_DROPTAIL;
		qstate(&cl->q_) = QS_RUNNING;
	}
	zfree(rmc_zone, cl);
}


/*
 * int
 * rmc_init(...) - Initialize the resource management data structures
 *	associated with the output portion of interface 'ifp'.  'ifd' is
 *	where the structures will be built (for backwards compatibility, the
 *	structures aren't kept in the ifnet struct).  'nsecPerByte'
 *	gives the link speed (inverse of bandwidth) in nanoseconds/byte.
 *	'restart' is the driver-specific routine that the generic 'delay
 *	until under limit' action will call to restart output.  `maxq'
 *	is the queue size of the 'link' & 'default' classes.  'maxqueued'
 *	is the maximum number of packets that the resource management
 *	code will allow to be queued 'downstream' (this is typically 1).
 *
 *	Returns:	0 on success
 */

int
rmc_init(struct ifclassq *ifq, struct rm_ifdat *ifd, u_int32_t nsecPerByte,
    void (*restart)(struct ifclassq *), u_int32_t qid, int maxq, int maxqueued,
    u_int32_t maxidle, int minidle, u_int32_t offtime, int flags)
{
	struct ifnet *ifp = ifq->ifcq_ifp;
	int i, mtu;

	/*
	 * Initialize the CBQ tracing/debug facility.
	 */
	CBQTRACEINIT();

	if (nsecPerByte == 0) {
		log(LOG_ERR, "%s: %s: invalid inverse data rate\n",
		    __func__, if_name(ifp));
		return (EINVAL);
	}

	mtu = ifp->if_mtu;
	if (mtu < 1) {
		log(LOG_ERR, "%s: %s: invalid MTU (interface not "
		    "initialized?)\n", __func__, if_name(ifp));
		return (EINVAL);
	}
	bzero((char *)ifd, sizeof (*ifd));

	ifd->ifq_ = ifq;
	ifd->restart = restart;
	ifd->maxqueued_ = maxqueued;
	ifd->ns_per_byte_ = nsecPerByte;
	ifd->maxpkt_ = mtu;
	ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
	ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
#if 1
	ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16;
	if (mtu * nsecPerByte > 10 * 1000000)
		ifd->maxiftime_ /= 4;
#endif

	reset_cutoff(ifd);
	CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);

	/*
	 * Initialize the CBQ's WRR state.
	 */
	for (i = 0; i < RM_MAXPRIO; i++) {
		ifd->alloc_[i] = 0;
		ifd->M_[i] = 0;
		ifd->num_[i] = 0;
		ifd->na_[i] = 0;
		ifd->active_[i] = NULL;
	}

	/*
	 * Initialize current packet state.
	 */
	ifd->qi_ = 0;
	ifd->qo_ = 0;
	for (i = 0; i < RM_MAXQUEUED; i++) {
		ifd->class_[i] = NULL;
		ifd->curlen_[i] = 0;
		ifd->borrowed_[i] = NULL;
	}

	/*
	 * Create the root class of the link-sharing structure.
	 */
	if ((ifd->root_ = rmc_newclass(0, ifd, nsecPerByte,
	    rmc_root_overlimit, qid, maxq, 0, 0, maxidle, minidle, offtime,
	    0, 0)) == NULL) {
		log(LOG_ERR, "rmc_init: root class not allocated\n");
		return (ENOMEM);
	}
	ifd->root_->depth_ = 0;

	return (0);
}

/*
 * void
 * rmc_queue_packet(struct rm_class *cl, struct mbuf *m) - Add packet given by
 *	mbuf 'm' to the queue for resource class 'cl'.  This routine is called
 *	by a driver's if_output routine.  This routine must be called with
 *	output packet completion interrupts locked out (to avoid racing with
 *	rmc_dequeue_next).
 *
 *	Returns:	0 on successful queueing
 *			CLASSQEQ_DROPPED when packet drop occurs
 */
int
rmc_queue_packet(struct rm_class *cl, struct mbuf *m, struct pf_mtag *t)
{
	struct timeval	 now;
	struct rm_ifdat *ifd = cl->ifdat_;
	int		 cpri = cl->pri_;
	int		 is_empty = qempty(&cl->q_);
	int ret	= 0;

	RM_GETTIME(now);
	if (ifd->cutoff_ > 0) {
		if (TV_LT(&cl->undertime_, &now)) {
			if (ifd->cutoff_ > cl->depth_)
				ifd->cutoff_ = cl->depth_;
			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
		} else {
			/*
			 * the class is overlimit. if the class has
			 * underlimit ancestors, set cutoff to the lowest
			 * depth among them.
			 */
			struct rm_class *borrow = cl->borrow_;

			while (borrow != NULL &&
			    borrow->depth_ < ifd->cutoff_) {
				if (TV_LT(&borrow->undertime_, &now)) {
					ifd->cutoff_ = borrow->depth_;
					CBQTRACE(rmc_queue_packet, 'ffob',
					    ifd->cutoff_);
					break;
				}
				borrow = borrow->borrow_;
			}
		}
	}

	ret = _rmc_addq(cl, m, t);
	if (ret != 0 &&
	    (ret == CLASSQEQ_DROPPED || ret == CLASSQEQ_DROPPED_FC ||
	    ret == CLASSQEQ_DROPPED_SP)) {
		/* failed */
		return (ret);
	}
	VERIFY(ret == 0 || ret == CLASSQEQ_SUCCESS_FC);
	if (is_empty) {
		CBQTRACE(rmc_queue_packet, 'type', cl->stats_.handle);
		ifd->na_[cpri]++;
	}

	if (qlen(&cl->q_) > qlimit(&cl->q_)) {
		/* note: qlimit can be set to 0 or 1 */
		rmc_drop_action(cl);
		return (CLASSQEQ_DROPPED);
	}
	return (ret);
}
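
/*
 * Hypothetical caller sketch (illustrative only, not code from this
 * file): a CBQ enqueue path would classify the packet to an rm_class
 * and hand it off, treating any CLASSQEQ_DROPPED* return as a packet
 * that has already been freed and accounted for by the drop action:
 *
 *	int err = rmc_queue_packet(cl, m, t);
 *	if (err == CLASSQEQ_DROPPED || err == CLASSQEQ_DROPPED_FC ||
 *	    err == CLASSQEQ_DROPPED_SP)
 *		return (err);	(the mbuf has already been freed)
 */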

/*
 * void
 * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
 *	classes to see if they are satisfied.
 */

static void
rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
{
	int		 i;
	rm_class_t	*p, *bp;

	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
		if ((bp = ifd->active_[i]) != NULL) {
			p = bp;
			do {
				if (!rmc_satisfied(p, now)) {
					ifd->cutoff_ = p->depth_;
					return;
				}
				p = p->peer_;
			} while (p != bp);
		}
	}

	reset_cutoff(ifd);
}

/*
 * rmc_satisfied - Return 1 if the class is satisfied, 0 otherwise.
 */

static int
rmc_satisfied(struct rm_class *cl, struct timeval *now)
{
	rm_class_t	*p;

	if (cl == NULL)
		return (1);
	if (TV_LT(now, &cl->undertime_))
		return (1);
	if (cl->depth_ == 0) {
		if (!cl->sleeping_ && (qlen(&cl->q_) > cl->qthresh_))
			return (0);
		else
			return (1);
	}
	if (cl->children_ != NULL) {
		p = cl->children_;
		while (p != NULL) {
			if (!rmc_satisfied(p, now))
				return (0);
			p = p->next_;
		}
	}

	return (1);
}

/*
 * Return 1 if class 'cl' is under limit or can borrow from a parent,
 * 0 if overlimit.  As a side-effect, this routine will invoke the
 * class overlimit action if the class is overlimit.
 */

static int
rmc_under_limit(struct rm_class *cl, struct timeval *now)
{
	rm_class_t	*p = cl;
	rm_class_t	*top;
	struct rm_ifdat	*ifd = cl->ifdat_;

	ifd->borrowed_[ifd->qi_] = NULL;
	/*
	 * If cl is the root class, then always return that it is
	 * underlimit.  Otherwise, check to see if the class is underlimit.
	 */
	if (cl->parent_ == NULL)
		return (1);

	if (cl->sleeping_) {
		if (TV_LT(now, &cl->undertime_))
			return (0);

		CALLOUT_STOP(&cl->callout_);
		cl->sleeping_ = 0;
		cl->undertime_.tv_sec = 0;
		return (1);
	}

	top = NULL;
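	/*
	 * Walk up the borrow chain: an overlimit class may still send if
	 * some ancestor it is permitted to borrow from is underlimit and
	 * lies at a depth no greater than the current cutoff_.  'top'
	 * tracks the most recent borrowable ancestor examined, for use
	 * by the BORROW_OFFTIME case below.
	 */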
	while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
		if (((cl = cl->borrow_) == NULL) ||
		    (cl->depth_ > ifd->cutoff_)) {
#ifdef ADJUST_CUTOFF
			if (cl != NULL)
				/*
				 * cutoff is taking effect, just
				 * return false without calling
				 * the delay action.
				 */
				return (0);
#endif
#ifdef BORROW_OFFTIME
			/*
			 * check if the class can borrow offtime too.
			 * borrow offtime from the top of the borrow
			 * chain if the top class is not overloaded.
			 */
			if (cl != NULL) {
				/*
				 * cutoff is taking effect, use this
				 * class as top.
				 */
				top = cl;
				CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
			}
			if (top != NULL && top->avgidle_ == top->minidle_)
				top = NULL;
			p->overtime_ = *now;
			(p->overlimit)(p, top);
#else
			p->overtime_ = *now;
			(p->overlimit)(p, NULL);
#endif
			return (0);
		}
		top = cl;
	}

	if (cl != p)
		ifd->borrowed_[ifd->qi_] = cl;
	return (1);
}

/*
 * _rmc_wrr_dequeue_next() - This is the scheduler for WRR as opposed to
 *	packet-by-packet round robin.
 *
 * The heart of the weighted round-robin scheduler, which decides which
 * class next gets to send a packet.  Highest priority first, then
 * weighted round-robin within priorities.
 *
 * Each able-to-send class gets to send until its byte allocation is
 * exhausted.  Thus, the active pointer is only changed after a class has
 * exhausted its allocation.
 *
 * If the scheduler finds no class that is underlimit or able to borrow,
 * then the first class found that had a nonzero queue and is allowed to
 * borrow gets to send.
 */

static struct mbuf *
_rmc_wrr_dequeue_next(struct rm_ifdat *ifd, cqdq_op_t op)
{
	struct rm_class	*cl = NULL, *first = NULL;
	u_int32_t	 deficit;
	int		 cpri;
	struct mbuf	*m;
	struct timeval	 now;

	RM_GETTIME(now);

	/*
	 * if the driver polls the top of the queue and then removes
	 * the polled packet, we must return the same packet.
	 */
	if (op == CLASSQDQ_REMOVE && ifd->pollcache_) {
		cl = ifd->pollcache_;
		cpri = cl->pri_;
		if (ifd->efficient_) {
			/* check if this class is overlimit */
			if (cl->undertime_.tv_sec != 0 &&
			    rmc_under_limit(cl, &now) == 0)
				first = cl;
		}
		ifd->pollcache_ = NULL;
		goto _wrr_out;
	} else {
		/* mode == CLASSQDQ_POLL || pollcache == NULL */
		ifd->pollcache_ = NULL;
		ifd->borrowed_[ifd->qi_] = NULL;
	}
#ifdef ADJUST_CUTOFF
_again:
#endif
	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
		if (ifd->na_[cpri] == 0)
			continue;
		deficit = 0;
		/*
		 * Loop through twice for a priority level, if some class
		 * was unable to send a packet the first round because
		 * of the weighted round-robin mechanism.
		 * During the second loop at this level, deficit==2.
		 * (This second loop is not needed if for every class,
		 * "M[cl->pri_]" times "cl->allotment" is greater than
		 * the byte size for the largest packet in the class.)
		 */
_wrr_loop:
		cl = ifd->active_[cpri];
		VERIFY(cl != NULL);
		do {
			if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
				cl->bytes_alloc_ += cl->w_allotment_;
			if (!qempty(&cl->q_)) {
				if ((cl->undertime_.tv_sec == 0) ||
				    rmc_under_limit(cl, &now)) {
					if (cl->bytes_alloc_ > 0 || deficit > 1)
						goto _wrr_out;

					/* underlimit but no alloc */
					deficit = 1;
#if 1
					ifd->borrowed_[ifd->qi_] = NULL;
#endif
				} else if (first == NULL && cl->borrow_ != NULL)
					first = cl; /* borrowing candidate */
			}

			cl->bytes_alloc_ = 0;
			cl = cl->peer_;
		} while (cl != ifd->active_[cpri]);

		if (deficit == 1) {
			/* first loop found an underlimit class with deficit */
			/* Loop on same priority level, with new deficit.  */
			deficit = 2;
			goto _wrr_loop;
		}
	}

#ifdef ADJUST_CUTOFF
	/*
	 * no underlimit class found.  if cutoff is taking effect,
	 * increase cutoff and try again.
	 */
	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
		ifd->cutoff_++;
		CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
		goto _again;
	}
#endif /* ADJUST_CUTOFF */
	/*
	 * If LINK_EFFICIENCY is turned on, then the first overlimit
	 * class we encounter will send a packet if all the classes
	 * of the link-sharing structure are overlimit.
	 */
	reset_cutoff(ifd);
	CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);

	if (!ifd->efficient_ || first == NULL)
		return (NULL);

	cl = first;
	cpri = cl->pri_;
#if 0	/* too time-consuming for nothing */
	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);
	cl->sleeping_ = 0;
	cl->undertime_.tv_sec = 0;
#endif
	ifd->borrowed_[ifd->qi_] = cl->borrow_;
	ifd->cutoff_ = cl->borrow_->depth_;

	/*
	 * Dequeue the packet and do the bookkeeping...
	 */
_wrr_out:
	if (op == CLASSQDQ_REMOVE) {
		m = _rmc_getq(cl);
		if (m == NULL)
			return (NULL);

		if (qempty(&cl->q_))
			ifd->na_[cpri]--;

		/*
		 * Update class statistics and link data.
		 */
		if (cl->bytes_alloc_ > 0)
			cl->bytes_alloc_ -= m_pktlen(m);

		if ((cl->bytes_alloc_ <= 0) || first == cl)
			ifd->active_[cl->pri_] = cl->peer_;
		else
			ifd->active_[cl->pri_] = cl;

		ifd->class_[ifd->qi_] = cl;
		ifd->curlen_[ifd->qi_] = m_pktlen(m);
		ifd->now_[ifd->qi_] = now;
		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
		ifd->queued_++;
	} else {
		/* mode == CLASSQDQ_POLL */
		m = _rmc_pollq(cl);
		ifd->pollcache_ = cl;
	}
	return (m);
}

/*
 * Dequeue & return next packet from the highest priority class that
 * has a packet to send & has enough allocation to send it.  This
 * routine is called by a driver whenever it needs a new packet to
 * output.
 */
static struct mbuf *
_rmc_prr_dequeue_next(struct rm_ifdat *ifd, cqdq_op_t op)
{
	struct mbuf	*m;
	int		 cpri;
	struct rm_class	*cl, *first = NULL;
	struct timeval	 now;

	RM_GETTIME(now);

	/*
	 * if the driver polls the top of the queue and then removes
	 * the polled packet, we must return the same packet.
	 */
	if (op == CLASSQDQ_REMOVE && ifd->pollcache_) {
		cl = ifd->pollcache_;
		cpri = cl->pri_;
		ifd->pollcache_ = NULL;
		goto _prr_out;
	} else {
		/* mode == CLASSQDQ_POLL || pollcache == NULL */
		ifd->pollcache_ = NULL;
		ifd->borrowed_[ifd->qi_] = NULL;
	}
#ifdef ADJUST_CUTOFF
_again:
#endif
	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
		if (ifd->na_[cpri] == 0)
			continue;
		cl = ifd->active_[cpri];
		VERIFY(cl != NULL);
		do {
			if (!qempty(&cl->q_)) {
				if ((cl->undertime_.tv_sec == 0) ||
				    rmc_under_limit(cl, &now))
					goto _prr_out;
				if (first == NULL && cl->borrow_ != NULL)
					first = cl;
			}
			cl = cl->peer_;
		} while (cl != ifd->active_[cpri]);
	}

#ifdef ADJUST_CUTOFF
	/*
	 * no underlimit class found.  if cutoff is taking effect, increase
	 * cutoff and try again.
	 */
	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
		ifd->cutoff_++;
		goto _again;
	}
#endif /* ADJUST_CUTOFF */
	/*
	 * If LINK_EFFICIENCY is turned on, then the first overlimit
	 * class we encounter will send a packet if all the classes
	 * of the link-sharing structure are overlimit.
	 */
	reset_cutoff(ifd);
	if (!ifd->efficient_ || first == NULL)
		return (NULL);

	cl = first;
	cpri = cl->pri_;
#if 0	/* too time-consuming for nothing */
	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);
	cl->sleeping_ = 0;
	cl->undertime_.tv_sec = 0;
#endif
	ifd->borrowed_[ifd->qi_] = cl->borrow_;
	ifd->cutoff_ = cl->borrow_->depth_;

	/*
	 * Dequeue the packet and do the bookkeeping...
	 */
_prr_out:
	if (op == CLASSQDQ_REMOVE) {
		m = _rmc_getq(cl);
		if (m == NULL)
			return (NULL);

		if (qempty(&cl->q_))
			ifd->na_[cpri]--;

		ifd->active_[cpri] = cl->peer_;

		ifd->class_[ifd->qi_] = cl;
		ifd->curlen_[ifd->qi_] = m_pktlen(m);
		ifd->now_[ifd->qi_] = now;
		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
		ifd->queued_++;
	} else {
		/* mode == CLASSQDQ_POLL */
		m = _rmc_pollq(cl);
		ifd->pollcache_ = cl;
	}
	return (m);
}

/*
 * struct mbuf *
 * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function
 *	is invoked by the packet driver to get the next packet to be
 *	dequeued and output on the link.  If WRR is enabled, then the
 *	WRR dequeue next routine will determine the next packet to be sent.
 *	Otherwise, packet-by-packet round robin is invoked.
 *
 *	Returns:	NULL, if a packet is not available or if all
 *			classes are overlimit.
 *
 *			Otherwise, Pointer to the next packet.
 */

struct mbuf *
rmc_dequeue_next(struct rm_ifdat *ifd, cqdq_op_t mode)
{
	if (ifd->queued_ >= ifd->maxqueued_)
		return (NULL);
	else if (ifd->wrr_)
		return (_rmc_wrr_dequeue_next(ifd, mode));
	else
		return (_rmc_prr_dequeue_next(ifd, mode));
}
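
/*
 * Hypothetical driver sketch (illustrative only): the poll cache in the
 * dequeue routines above exists so that a driver may peek at the next
 * packet before committing to it, e.g.:
 *
 *	m = rmc_dequeue_next(ifd, CLASSQDQ_POLL);
 *	if (m != NULL && driver_has_room(m)) {
 *		m = rmc_dequeue_next(ifd, CLASSQDQ_REMOVE);
 *		... transmit m ...
 *		rmc_update_class_util(ifd);
 *	}
 *
 * where driver_has_room() is a made-up placeholder; the REMOVE call is
 * guaranteed to return the packet just polled, and the utilization
 * update runs at (estimated) transmit completion.
 */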

/*
 * Update the utilization estimate for the packet that just completed.
 * The packet's class & the parent(s) of that class all get their
 * estimators updated.  This routine is called by the driver's output-
 * packet-completion interrupt service routine.
 */

/*
 * a macro to approximate "divide by 1000" that gives 0.000999,
 * if a value has enough effective digits.
 * (on pentium, mul takes 9 cycles but div takes 46!)
 */
#define	NSEC_TO_USEC(t)	(((t) >> 10) + ((t) >> 16) + ((t) >> 17))
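/*
 * Sanity check on the approximation above:
 * 1/1024 + 1/65536 + 1/131072 = 0.0009765625 + 0.0000152588 +
 * 0.0000076294 ~= 0.0009994507, i.e. about 0.05% below a true
 * divide-by-1000.
 */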
void
rmc_update_class_util(struct rm_ifdat *ifd)
{
	int		 idle, avgidle, pktlen;
	int		 pkt_time, tidle;
	rm_class_t	*cl, *borrowed;
	rm_class_t	*borrows;
	struct timeval	*nowp;

	/*
	 * Get the most recent completed class.
	 */
	if ((cl = ifd->class_[ifd->qo_]) == NULL)
		return;

	pktlen = ifd->curlen_[ifd->qo_];
	borrowed = ifd->borrowed_[ifd->qo_];
	borrows = borrowed;

	PKTCNTR_ADD(&cl->stats_.xmit_cnt, 1, pktlen);

	/*
	 * Run estimator on class and its ancestors.
	 */
	/*
	 * rmc_update_class_util is designed to be called when the
	 * transfer is completed from an xmit-complete interrupt,
	 * but most drivers don't implement an upcall for that.
	 * so, just use estimated completion time.
	 * as a result, ifd->qi_ and ifd->qo_ are always synced.
	 */
	nowp = &ifd->now_[ifd->qo_];
	/* get pkt_time (for link) in usec */
#if 1  /* use approximation */
	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_;
	pkt_time = NSEC_TO_USEC(pkt_time);
#else
	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
#endif
#if 1 /* ALTQ4PPP */
	if (TV_LT(nowp, &ifd->ifnow_)) {
		int iftime;

		/*
		 * make sure the estimated completion time does not go
		 * too far.  it can happen when the link layer supports
		 * data compression or the interface speed is set to
		 * a much lower value.
		 */
		TV_DELTA(&ifd->ifnow_, nowp, iftime);
		if (iftime+pkt_time < ifd->maxiftime_) {
			TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
		} else {
			TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
		}
	} else {
		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
	}
#else
	if (TV_LT(nowp, &ifd->ifnow_)) {
		TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
	} else {
		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
	}
#endif

	while (cl != NULL) {
		TV_DELTA(&ifd->ifnow_, &cl->last_, idle);
		if (idle >= 2000000)
			/*
			 * this class is idle enough, reset avgidle.
			 * (TV_DELTA returns 2000000 us when delta is large.)
			 */
			cl->avgidle_ = cl->maxidle_;

		/* get pkt_time (for class) in usec */
#if 1  /* use approximation */
		pkt_time = pktlen * cl->ns_per_byte_;
		pkt_time = NSEC_TO_USEC(pkt_time);
#else
		pkt_time = pktlen * cl->ns_per_byte_ / 1000;
#endif
		idle -= pkt_time;

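		/*
		 * Exponentially weighted moving average with gain
		 * g = 1 - 2^-RM_FILTER_GAIN.  Since avgidle_ is kept
		 * scaled up by 2^RM_FILTER_GAIN, the in-place update
		 * below is equivalent to avg = g*avg + (1-g)*idle on
		 * the unscaled value.
		 */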
		avgidle = cl->avgidle_;
		avgidle += idle - (avgidle >> RM_FILTER_GAIN);
		cl->avgidle_ = avgidle;

		/* Are we overlimit ? */
		if (avgidle <= 0) {
			CBQTRACE(rmc_update_class_util, 'milo',
			    cl->stats_.handle);
			/*
			 * need some lower bound for avgidle, otherwise
			 * a borrowing class gets unbounded penalty.
			 */
			if (avgidle < cl->minidle_)
				avgidle = cl->avgidle_ = cl->minidle_;

			/* set next idle to make avgidle 0 */
			tidle = pkt_time +
			    (((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
			TV_ADD_DELTA(nowp, tidle, &cl->undertime_);
			++cl->stats_.over;
		} else {
			cl->avgidle_ =
			    (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
			cl->undertime_.tv_sec = 0;
			if (cl->sleeping_) {
				CALLOUT_STOP(&cl->callout_);
				cl->sleeping_ = 0;
			}
		}

		if (borrows != NULL) {
			if (borrows != cl)
				++cl->stats_.borrows;
			else
				borrows = NULL;
		}
		cl->last_ = ifd->ifnow_;
		cl->last_pkttime_ = pkt_time;

#if 1
		if (cl->parent_ == NULL) {
			/* take stats of root class */
			PKTCNTR_ADD(&cl->stats_.xmit_cnt, 1, pktlen);
		}
#endif

		cl = cl->parent_;
	}

	/*
	 * Check to see if cutoff needs to be set to a new level.
	 */
	cl = ifd->class_[ifd->qo_];
	if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
		if ((qlen(&cl->q_) <= 0) ||
		    TV_LT(nowp, &borrowed->undertime_)) {
			rmc_tl_satisfied(ifd, nowp);
			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
		} else {
			ifd->cutoff_ = borrowed->depth_;
			CBQTRACE(rmc_update_class_util, 'ffob',
			    borrowed->depth_);
		}
	}

	/*
	 * Release class slot
	 */
	ifd->borrowed_[ifd->qo_] = NULL;
	ifd->class_[ifd->qo_] = NULL;
	ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
	ifd->queued_--;
}

/*
 * void
 * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
 *	over-limit action routines.  These get invoked by rmc_under_limit()
 *	if a class with packets to send is over its bandwidth limit & can't
 *	borrow from a parent class.
 *
 *	Returns: NONE
 */

static void
rmc_drop_action(struct rm_class *cl)
{
	struct rm_ifdat	*ifd = cl->ifdat_;

	VERIFY(qlen(&cl->q_) > 0);
	IFCQ_CONVERT_LOCK(ifd->ifq_);
	_rmc_dropq(cl);
	if (qempty(&cl->q_))
		ifd->na_[cl->pri_]--;
}

void
rmc_drop(struct rm_class *cl, u_int32_t flow, u_int32_t *packets,
    u_int32_t *bytes)
{
	struct rm_ifdat	*ifd = cl->ifdat_;
	struct ifclassq *ifq = ifd->ifq_;
	u_int32_t pkt = 0, len = 0, qlen;

	if ((qlen = qlen(&cl->q_)) != 0) {
		IFCQ_CONVERT_LOCK(ifq);
#if CLASSQ_RIO
		if (q_is_rio(&cl->q_))
			rio_purgeq(cl->rio_, &cl->q_, flow, &pkt, &len);
		else
#endif /* CLASSQ_RIO */
#if CLASSQ_RED
		if (q_is_red(&cl->q_))
			red_purgeq(cl->red_, &cl->q_, flow, &pkt, &len);
		else
#endif /* CLASSQ_RED */
#if CLASSQ_BLUE
		if (q_is_blue(&cl->q_))
			blue_purgeq(cl->blue_, &cl->q_, flow, &pkt, &len);
		else
#endif /* CLASSQ_BLUE */
		if (q_is_sfb(&cl->q_) && cl->sfb_ != NULL)
			sfb_purgeq(cl->sfb_, &cl->q_, flow, &pkt, &len);
		else
			_flushq_flow(&cl->q_, flow, &pkt, &len);

		if (pkt > 0) {
			VERIFY(qlen(&cl->q_) == (qlen - pkt));

			PKTCNTR_ADD(&cl->stats_.drop_cnt, pkt, len);
			IFCQ_DROP_ADD(ifq, pkt, len);

			VERIFY(((signed)IFCQ_LEN(ifq) - pkt) >= 0);
			IFCQ_LEN(ifq) -= pkt;

			if (qempty(&cl->q_))
				ifd->na_[cl->pri_]--;
		}
	}
	if (packets != NULL)
		*packets = pkt;
	if (bytes != NULL)
		*bytes = len;
}

void
rmc_dropall(struct rm_class *cl)
{
	rmc_drop(cl, 0, NULL, NULL);
}

/*
 * void
 * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
 *	delay action routine.  It is invoked via rmc_under_limit when the
 *	class is discovered to be overlimit.
 *
 *	If the delay action is the result of a borrowed class being
 *	overlimit, then delay for the offtime of the borrowed class that
 *	is overlimit.
 *
 *	Returns: NONE
 */

void
rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
{
	int	ndelay, t, extradelay;

	cl->stats_.overactions++;
	TV_DELTA(&cl->undertime_, &cl->overtime_, ndelay);
#ifndef BORROW_OFFTIME
	ndelay += cl->offtime_;
#endif

	if (!cl->sleeping_) {
		CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
#ifdef BORROW_OFFTIME
		if (borrow != NULL)
			extradelay = borrow->offtime_;
		else
#endif
			extradelay = cl->offtime_;

		/*
		 * XXX recalculate suspend time:
		 * current undertime is (tidle + pkt_time) calculated
		 * from the last transmission.
		 *	tidle: time required to bring avgidle back to 0
		 *	pkt_time: target waiting time for this class
		 * we need to replace pkt_time by offtime
		 */
		extradelay -= cl->last_pkttime_;
		if (extradelay > 0) {
			TV_ADD_DELTA(&cl->undertime_, extradelay,
			    &cl->undertime_);
			ndelay += extradelay;
		}

		cl->sleeping_ = 1;
		cl->stats_.delays++;

		/*
		 * Since packets are phased randomly with respect to the
		 * clock, 1 tick (the next clock tick) can be an arbitrarily
		 * short time so we have to wait for at least two ticks.
		 * NOTE:  If there's no other traffic, we need the timer as
		 * a 'backstop' to restart this class.
		 */
		if (ndelay > tick * 2) {
			/*
			 * FreeBSD rounds up the tick;
			 * other BSDs round down the tick.
			 */
			t = hzto(&cl->undertime_) + 1;
		} else {
			t = 2;
		}
		CALLOUT_RESET(&cl->callout_, t,
		    (timeout_t *)rmc_restart, (caddr_t)cl);
	}
}

/*
 * void
 * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
 *	called by the system timer code & is responsible for checking if the
 *	class is still sleeping (it might have been restarted as a side
 *	effect of the queue scan on a packet arrival) and, if so, restarting
 *	output for the class.  Inspecting the class state & restarting output
 *	require locking the class structure.  In general the driver is
 *	responsible for locking but this is the only routine that is not
 *	called directly or indirectly from the interface driver so it has
 *	to know about system locking conventions.
 *
 *	Returns:	NONE
 */

static void
rmc_restart(struct rm_class *cl)
{
	struct rm_ifdat	*ifd = cl->ifdat_;

	if (cl->sleeping_) {
		cl->sleeping_ = 0;
		cl->undertime_.tv_sec = 0;

		if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
			CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
			(ifd->restart)(ifd->ifq_);
		}
	}
}

/*
 * void
 * rmc_root_overlimit(struct rm_class *cl) - This is the generic overlimit
 *	handling routine for the root class of the link-sharing structure.
 *
 *	Returns: NONE
 */
static void
rmc_root_overlimit(struct rm_class *cl,
    struct rm_class *borrow)
{
#pragma unused(cl, borrow)
	panic("rmc_root_overlimit");
}

/*
 * Packet Queue handling routines.  Eventually, this is to localize the
 *	effects on the code of whether queues are RED queues or droptail
 *	queues.
 */

static int
_rmc_addq(rm_class_t *cl, struct mbuf *m, struct pf_mtag *t)
{
#if CLASSQ_RIO
	if (q_is_rio(&cl->q_))
		return (rio_addq(cl->rio_, &cl->q_, m, t));
	else
#endif /* CLASSQ_RIO */
#if CLASSQ_RED
	if (q_is_red(&cl->q_))
		return (red_addq(cl->red_, &cl->q_, m, t));
	else
#endif /* CLASSQ_RED */
#if CLASSQ_BLUE
	if (q_is_blue(&cl->q_))
		return (blue_addq(cl->blue_, &cl->q_, m, t));
	else
#endif /* CLASSQ_BLUE */
	if (q_is_sfb(&cl->q_)) {
		if (cl->sfb_ == NULL) {
			struct ifclassq *ifq = cl->ifdat_->ifq_;
			struct ifnet *ifp = ifq->ifcq_ifp;

			VERIFY(cl->flags_ & RMCF_LAZY);
			IFCQ_CONVERT_LOCK(ifq);

			cl->sfb_ = sfb_alloc(ifp, cl->stats_.handle,
			    qlimit(&cl->q_), cl->qflags_);
			if (cl->sfb_ == NULL) {
				/* fall back to droptail */
				qtype(&cl->q_) = Q_DROPTAIL;
				cl->flags_ &= ~RMCF_SFB;
				cl->qflags_ &= ~(SFBF_ECN | SFBF_FLOWCTL);

				log(LOG_ERR, "%s: CBQ SFB lazy allocation "
				    "failed for qid=%d pri=%d, falling back "
				    "to DROPTAIL\n", if_name(ifp),
				    cl->stats_.handle, cl->pri_);
			}
		}
		if (cl->sfb_ != NULL)
			return (sfb_addq(cl->sfb_, &cl->q_, m, t));
	}
#if PF_ECN
	else if (cl->flags_ & RMCF_CLEARDSCP)
		write_dsfield(m, t, 0);
#endif /* PF_ECN */

	/* test for qlen > qlimit is done by caller */
	_addq(&cl->q_, m);
	return (0);
}

/* note: _rmc_dropq is not called for red */
static void
_rmc_dropq(rm_class_t *cl)
{
	struct mbuf *m;

	if ((m = _rmc_getq(cl)) != NULL)
		m_freem(m);
}

static struct mbuf *
_rmc_getq(rm_class_t *cl)
{
#if CLASSQ_RIO
	if (q_is_rio(&cl->q_))
		return (rio_getq(cl->rio_, &cl->q_));
	else
#endif /* CLASSQ_RIO */
#if CLASSQ_RED
	if (q_is_red(&cl->q_))
		return (red_getq(cl->red_, &cl->q_));
	else
#endif /* CLASSQ_RED */
#if CLASSQ_BLUE
	if (q_is_blue(&cl->q_))
		return (blue_getq(cl->blue_, &cl->q_));
	else
#endif /* CLASSQ_BLUE */
	if (q_is_sfb(&cl->q_) && cl->sfb_ != NULL)
		return (sfb_getq(cl->sfb_, &cl->q_));

	return (_getq(&cl->q_));
}

static struct mbuf *
_rmc_pollq(rm_class_t *cl)
{
	return (qhead(&cl->q_));
}

void
rmc_updateq(rm_class_t *cl, cqev_t ev)
{
#if CLASSQ_RIO
	if (q_is_rio(&cl->q_))
		return (rio_updateq(cl->rio_, ev));
#endif /* CLASSQ_RIO */
#if CLASSQ_RED
	if (q_is_red(&cl->q_))
		return (red_updateq(cl->red_, ev));
#endif /* CLASSQ_RED */
#if CLASSQ_BLUE
	if (q_is_blue(&cl->q_))
		return (blue_updateq(cl->blue_, ev));
#endif /* CLASSQ_BLUE */
	if (q_is_sfb(&cl->q_) && cl->sfb_ != NULL)
		return (sfb_updateq(cl->sfb_, ev));
}

#ifdef CBQ_TRACE

struct cbqtrace		 cbqtrace_buffer[NCBQTRACE+1];
struct cbqtrace		*cbqtrace_ptr = NULL;
int			 cbqtrace_count;

/*
 * DDB hook to trace cbq events:
 *  the last 1024 events are held in a circular buffer.
 *  use "call cbqtrace_dump(N)" to display 20 events from the Nth event.
 */
void cbqtrace_dump(int);
static char *rmc_funcname(void *);

static struct rmc_funcs {
	void	*func;
	char	*name;
} rmc_funcs[] =
{
	rmc_init,		"rmc_init",
	rmc_queue_packet,	"rmc_queue_packet",
	rmc_under_limit,	"rmc_under_limit",
	rmc_update_class_util,	"rmc_update_class_util",
	rmc_delay_action,	"rmc_delay_action",
	rmc_restart,		"rmc_restart",
	_rmc_wrr_dequeue_next,	"_rmc_wrr_dequeue_next",
	NULL,			NULL
};

static char *
rmc_funcname(void *func)
{
	struct rmc_funcs *fp;

	for (fp = rmc_funcs; fp->func != NULL; fp++)
		if (fp->func == func)
			return (fp->name);
	return ("unknown");
}

void
cbqtrace_dump(int counter)
{
	int	 i, *p;
	char	*cp;

	counter = counter % NCBQTRACE;
	p = (int *)&cbqtrace_buffer[counter];

	for (i = 0; i < 20; i++) {
		log(LOG_DEBUG, "[0x%x] ", *p++);
		log(LOG_DEBUG, "%s: ", rmc_funcname((void *)*p++));
		cp = (char *)p++;
		log(LOG_DEBUG, "%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
		log(LOG_DEBUG, "%d\n", *p++);

		if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
			p = (int *)cbqtrace_buffer;
	}
}
#endif /* CBQ_TRACE */
#endif /* PKTSCHED_CBQ */