/*	$FreeBSD: head/sys/contrib/altq/altq/altq_rmclass.c 240646 2012-09-18 12:34:35Z glebius $	*/
/*	$KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $	*/

/*
 * Copyright (c) 1991-1997 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the Network Research
 *      Group at Lawrence Berkeley Laboratory.
 * 4. Neither the name of the University nor of the Laboratory may be used
 *    to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * LBL code modified by speer@eng.sun.com, May 1997.
 * For questions and/or comments, please send mail to cbq@ee.lbl.gov
 */

#ident "@(#)rm_class.c  1.48     97/12/05 SMI"

#if defined(__FreeBSD__) || defined(__NetBSD__)
#include "opt_altq.h"
#include "opt_inet.h"
#ifdef __FreeBSD__
#include "opt_inet6.h"
#endif
#endif /* __FreeBSD__ || __NetBSD__ */
#ifdef ALTQ_CBQ	/* cbq is enabled by ALTQ_CBQ option in opt_altq.h */

#include <sys/param.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/time.h>
#ifdef ALTQ3_COMPAT
#include <sys/kernel.h>
#endif

#include <net/if.h>
#ifdef ALTQ3_COMPAT
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#endif

#include <altq/altq.h>
#include <altq/altq_rmclass.h>
#include <altq/altq_rmclass_debug.h>
#include <altq/altq_red.h>
#include <altq/altq_rio.h>

/*
 * Local Macros
 */

#define	reset_cutoff(ifd)	{ ifd->cutoff_ = RM_MAXDEPTH; }

/*
 * Local routines.
 */

static int	rmc_satisfied(struct rm_class *, struct timeval *);
static void	rmc_wrr_set_weights(struct rm_ifdat *);
static void	rmc_depth_compute(struct rm_class *);
static void	rmc_depth_recompute(rm_class_t *);

static mbuf_t	*_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
static mbuf_t	*_rmc_prr_dequeue_next(struct rm_ifdat *, int);

static int	_rmc_addq(rm_class_t *, mbuf_t *);
static void	_rmc_dropq(rm_class_t *);
static mbuf_t	*_rmc_getq(rm_class_t *);
static mbuf_t	*_rmc_pollq(rm_class_t *);

static int	rmc_under_limit(struct rm_class *, struct timeval *);
static void	rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
static void	rmc_drop_action(struct rm_class *);
static void	rmc_restart(struct rm_class *);
static void	rmc_root_overlimit(struct rm_class *, struct rm_class *);

#define	BORROW_OFFTIME
/*
 * BORROW_OFFTIME (experimental):
 * borrow the offtime of the class being borrowed from.
 * the reason is that when its own offtime is set, the class is unable
 * to borrow much, especially when cutoff is taking effect.
 * but when the borrowed class is overloaded (avgidle is close to minidle),
 * use the borrowing class's offtime to avoid overload.
 */
#define	ADJUST_CUTOFF
/*
 * ADJUST_CUTOFF (experimental):
 * if no underlimit class is found due to cutoff, increase cutoff and
 * retry the scheduling loop.
 * also, don't invoke delay_actions while cutoff is taking effect,
 * since a sleeping class won't have a chance to be scheduled in the
 * next loop.
 *
 * now the heuristics for setting the top-level variable (cutoff_) become:
 *	1. if a packet arrives for a not-overlimit class, set cutoff
 *	   to the depth of the class.
 *	2. if cutoff is i, and a packet arrives for an overlimit class
 *	   with an underlimit ancestor at a lower level than i (say j),
 *	   then set cutoff to j.
 *	3. when scheduling a packet, if there is no underlimit class
 *	   due to the current cutoff level, increase cutoff by 1 and
 *	   then try to schedule again.
 */
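
/*
 * An editorial illustration of the heuristics above (not from the
 * original source): with a root at depth 2, an agency class at depth 1
 * and leaf classes at depth 0, cutoff_ == 0 lets only leaves that are
 * underlimit in their own right send.  If a leaf goes overlimit while
 * its depth-1 parent is underlimit, rule 2 sets cutoff_ to 1 so the
 * leaf may borrow; rule 3 then raises cutoff_ further only when an
 * entire scheduling pass finds nothing to send.
 */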

/*
 * rm_class_t *
 * rmc_newclass(...) - Create a new resource management class at priority
 * 'pri' on the interface given by 'ifd'.
 *
 * nsecPerByte  is the data rate of the interface in nanoseconds/byte.
 *              E.g., 800 for a 10Mb/s ethernet.  If the class gets less
 *              than 100% of the bandwidth, this number should be the
 *              'effective' rate for the class.  Let f be the
 *              bandwidth fraction allocated to this class, and let
 *              nsPerByte be the data rate of the output link in
 *              nanoseconds/byte.  Then nsecPerByte is set to
 *              nsPerByte / f.  E.g., 1600 (= 800 / .5)
 *              for a class that gets 50% of an ethernet's bandwidth.
 *
 * action       the routine to call when the class is over limit.
 *
 * maxq         max allowable queue size for class (in packets).
 *
 * parent       parent class pointer.
 *
 * borrow       class to borrow from (should be either 'parent' or null).
 *
 * maxidle      max value allowed for class 'idle' time estimate (this
 *              parameter determines how large an initial burst of packets
 *              can be before overlimit action is invoked.)
 *
 * offtime      how long 'delay' action will delay when class goes over
 *              limit (this parameter determines the steady-state burst
 *              size when a class is running over its limit).
 *
 * Maxidle and offtime have to be computed from the following:  If the
 * average packet size is s, the bandwidth fraction allocated to this
 * class is f, we want to allow b packet bursts, and the gain of the
 * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
 *
 *   ptime = s * nsPerByte * (1 - f) / f
 *   maxidle = ptime * (1 - g^b) / g^b
 *   minidle = -ptime * (1 / (f - 1))
 *   offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1))
 *
 * Operationally, it's convenient to specify maxidle & offtime in units
 * independent of the link bandwidth so the maxidle & offtime passed to
 * this routine are the above values multiplied by 8*f/(1000*nsPerByte).
 * (The constant factor is a scale factor needed to make the parameters
 * integers.  This scaling also means that the 'unscaled' values of
 * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
 * not nanoseconds.)  Also note that the 'idle' filter computation keeps
 * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
 * maxidle also must be scaled upward by this value.  Thus, the passed
 * values for maxidle and offtime can be computed as follows:
 *
 * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
 * offtime = offtime * 8 / (1000 * nsecPerByte)
 *
 * When USE_HRTIME is employed, then maxidle and offtime become:
 * 	maxidle = maxidle * (8.0 / nsecPerByte);
 * 	offtime = offtime * (8.0 / nsecPerByte);
 */
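/*
 * Worked example of the scaling above (editorial; assumes
 * RM_FILTER_GAIN == 5 as in altq_rmclass.h): a class given 50% of a
 * 10Mb/s ethernet has nsecPerByte = 800 / .5 = 1600.  If the unscaled
 * maxidle from the formulas is 530 usec (530000 nsec), the value
 * passed in is
 *	maxidle = 530000 * 2^5 * 8 / (1000 * 1600) =~ 85
 * and the code below undoes the link-independent part of the scaling:
 *	cl->maxidle_ = (85 * 1600) / 8 = 17000 =~ 530 usec << 5.
 */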
struct rm_class *
rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte,
    void (*action)(rm_class_t *, rm_class_t *), int maxq,
    struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
    int minidle, u_int offtime, int pktsize, int flags)
{
	struct rm_class	*cl;
	struct rm_class	*peer;
	int		 s;

	if (pri >= RM_MAXPRIO)
		return (NULL);
#ifndef ALTQ_RED
	if (flags & RMCF_RED) {
#ifdef ALTQ_DEBUG
		printf("rmc_newclass: RED not configured for CBQ!\n");
#endif
		return (NULL);
	}
#endif
#ifndef ALTQ_RIO
	if (flags & RMCF_RIO) {
#ifdef ALTQ_DEBUG
		printf("rmc_newclass: RIO not configured for CBQ!\n");
#endif
		return (NULL);
	}
#endif

	cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (cl == NULL)
		return (NULL);
	CALLOUT_INIT(&cl->callout_);
	cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO);
	if (cl->q_ == NULL) {
		free(cl, M_DEVBUF);
		return (NULL);
	}

	/*
	 * Class initialization.
	 */
	cl->children_ = NULL;
	cl->parent_ = parent;
	cl->borrow_ = borrow;
	cl->leaf_ = 1;
	cl->ifdat_ = ifd;
	cl->pri_ = pri;
	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
	cl->depth_ = 0;
	cl->qthresh_ = 0;
	cl->ns_per_byte_ = nsecPerByte;

	qlimit(cl->q_) = maxq;
	qtype(cl->q_) = Q_DROPHEAD;
	qlen(cl->q_) = 0;
	cl->flags_ = flags;

#if 1 /* minidle is also scaled in ALTQ */
	cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
	if (cl->minidle_ > 0)
		cl->minidle_ = 0;
#else
	cl->minidle_ = minidle;
#endif
	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
	if (cl->maxidle_ == 0)
		cl->maxidle_ = 1;
#if 1 /* offtime is also scaled in ALTQ */
	cl->avgidle_ = cl->maxidle_;
	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
	if (cl->offtime_ == 0)
		cl->offtime_ = 1;
#else
	cl->avgidle_ = 0;
	cl->offtime_ = (offtime * nsecPerByte) / 8;
#endif
	cl->overlimit = action;

#ifdef ALTQ_RED
	if (flags & (RMCF_RED|RMCF_RIO)) {
		int red_flags, red_pkttime;

		red_flags = 0;
		if (flags & RMCF_ECN)
			red_flags |= REDF_ECN;
		if (flags & RMCF_FLOWVALVE)
			red_flags |= REDF_FLOWVALVE;
#ifdef ALTQ_RIO
		if (flags & RMCF_CLEARDSCP)
			red_flags |= RIOF_CLEARDSCP;
#endif
		red_pkttime = nsecPerByte * pktsize / 1000;

		if (flags & RMCF_RED) {
			cl->red_ = red_alloc(0, 0,
			    qlimit(cl->q_) * 10/100,
			    qlimit(cl->q_) * 30/100,
			    red_flags, red_pkttime);
			if (cl->red_ != NULL)
				qtype(cl->q_) = Q_RED;
		}
#ifdef ALTQ_RIO
		else {
			cl->red_ = (red_t *)rio_alloc(0, NULL,
						      red_flags, red_pkttime);
			if (cl->red_ != NULL)
				qtype(cl->q_) = Q_RIO;
		}
#endif
	}
#endif /* ALTQ_RED */

	/*
	 * put the class into the class tree
	 */
#ifdef __NetBSD__
	s = splnet();
#else
	s = splimp();
#endif
	IFQ_LOCK(ifd->ifq_);
	if ((peer = ifd->active_[pri]) != NULL) {
		/* find the last class at this pri */
		cl->peer_ = peer;
		while (peer->peer_ != ifd->active_[pri])
			peer = peer->peer_;
		peer->peer_ = cl;
	} else {
		ifd->active_[pri] = cl;
		cl->peer_ = cl;
	}

	if (cl->parent_) {
		cl->next_ = parent->children_;
		parent->children_ = cl;
		parent->leaf_ = 0;
	}

	/*
	 * Compute the depth of this class and its ancestors in the class
	 * hierarchy.
	 */
	rmc_depth_compute(cl);

	/*
	 * If CBQ's WRR is enabled, then initialize the class WRR state.
	 */
	if (ifd->wrr_) {
		ifd->num_[pri]++;
		ifd->alloc_[pri] += cl->allotment_;
		rmc_wrr_set_weights(ifd);
	}
	IFQ_UNLOCK(ifd->ifq_);
	splx(s);
	return (cl);
}

int
rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle,
    int minidle, u_int offtime, int pktsize)
{
	struct rm_ifdat	*ifd;
	u_int		 old_allotment;
	int		 s;

	ifd = cl->ifdat_;
	old_allotment = cl->allotment_;

#ifdef __NetBSD__
	s = splnet();
#else
	s = splimp();
#endif
	IFQ_LOCK(ifd->ifq_);
	cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
	cl->qthresh_ = 0;
	cl->ns_per_byte_ = nsecPerByte;

	qlimit(cl->q_) = maxq;

#if 1 /* minidle is also scaled in ALTQ */
	cl->minidle_ = (minidle * nsecPerByte) / 8;
	if (cl->minidle_ > 0)
		cl->minidle_ = 0;
#else
	cl->minidle_ = minidle;
#endif
	cl->maxidle_ = (maxidle * nsecPerByte) / 8;
	if (cl->maxidle_ == 0)
		cl->maxidle_ = 1;
#if 1 /* offtime is also scaled in ALTQ */
	cl->avgidle_ = cl->maxidle_;
	cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
	if (cl->offtime_ == 0)
		cl->offtime_ = 1;
#else
	cl->avgidle_ = 0;
	cl->offtime_ = (offtime * nsecPerByte) / 8;
#endif

	/*
	 * If CBQ's WRR is enabled, then initialize the class WRR state.
	 */
	if (ifd->wrr_) {
		ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
		rmc_wrr_set_weights(ifd);
	}
	IFQ_UNLOCK(ifd->ifq_);
	splx(s);
	return (0);
}
/*
 * static void
 * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
 *	the appropriate round-robin weights for the CBQ weighted round-robin
 *	algorithm.
 *
 *	Returns: NONE
 */

static void
rmc_wrr_set_weights(struct rm_ifdat *ifd)
{
	int		i;
	struct rm_class	*cl, *clh;

	for (i = 0; i < RM_MAXPRIO; i++) {
		/*
		 * This is inverted from that of the simulator to
		 * maintain precision.
		 */
		if (ifd->num_[i] == 0)
			ifd->M_[i] = 0;
		else
			ifd->M_[i] = ifd->alloc_[i] /
				(ifd->num_[i] * ifd->maxpkt_);
		/*
		 * Compute the weighted allotment for each class.
		 * This takes the expensive div instruction out
		 * of the main loop for the wrr scheduling path.
		 * These only get recomputed when a class comes or
		 * goes.
		 */
		if (ifd->active_[i] != NULL) {
			clh = cl = ifd->active_[i];
			do {
				/* safe-guard for slow link or alloc_ == 0 */
				if (ifd->M_[i] == 0)
					cl->w_allotment_ = 0;
				else
					cl->w_allotment_ = cl->allotment_ /
						ifd->M_[i];
				cl = cl->peer_;
			} while ((cl != NULL) && (cl != clh));
		}
	}
}
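
/*
 * Worked example (editorial): two classes at one priority on a 10Mb/s
 * link, each with allotment_ = 625000 bytes/sec (50%), and
 * maxpkt_ = 1500.  Then M_[i] = 1250000 / (2 * 1500) = 416 and each
 * class gets w_allotment_ = 625000 / 416 = 1502 bytes, i.e. roughly
 * one MTU-sized packet per WRR round.
 */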

int
rmc_get_weight(struct rm_ifdat *ifd, int pri)
{
	if ((pri >= 0) && (pri < RM_MAXPRIO))
		return (ifd->M_[pri]);
	else
		return (0);
}

/*
 * static void
 * rmc_depth_compute(struct rm_class *cl) - This function computes the
 *	appropriate depth of class 'cl' and its ancestors.
 *
 *	Returns:	NONE
 */

static void
rmc_depth_compute(struct rm_class *cl)
{
	rm_class_t	*t = cl, *p;

	/*
	 * Recompute the depth for the branch of the tree.
	 */
	while (t != NULL) {
		p = t->parent_;
		if (p && (t->depth_ >= p->depth_)) {
			p->depth_ = t->depth_ + 1;
			t = p;
		} else
			t = NULL;
	}
}

/*
 * static void
 * rmc_depth_recompute(struct rm_class *cl) - This function re-computes
 *	the depth of the tree after a class has been deleted.
 *
 *	Returns:	NONE
 */

static void
rmc_depth_recompute(rm_class_t *cl)
{
#if 1 /* ALTQ */
	rm_class_t	*p, *t;

	p = cl;
	while (p != NULL) {
		if ((t = p->children_) == NULL) {
			p->depth_ = 0;
		} else {
			int cdepth = 0;

			while (t != NULL) {
				if (t->depth_ > cdepth)
					cdepth = t->depth_;
				t = t->next_;
			}

			if (p->depth_ == cdepth + 1)
				/* no change to this parent */
				return;

			p->depth_ = cdepth + 1;
		}

		p = p->parent_;
	}
#else
	rm_class_t	*t;

	if (cl->depth_ >= 1) {
		if (cl->children_ == NULL) {
			cl->depth_ = 0;
		} else if ((t = cl->children_) != NULL) {
			while (t != NULL) {
				if (t->children_ != NULL)
					rmc_depth_recompute(t);
				t = t->next_;
			}
		} else
			rmc_depth_compute(cl);
	}
#endif
}

/*
 * void
 * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
 *	function deletes a class from the link-sharing structure and frees
 *	all resources associated with the class.
 *
 *	Returns: NONE
 */

void
rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
{
	struct rm_class	*p, *head, *previous;
	int		 s;

	ASSERT(cl->children_ == NULL);

	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);

#ifdef __NetBSD__
	s = splnet();
#else
	s = splimp();
#endif
	IFQ_LOCK(ifd->ifq_);
	/*
	 * Free packets in the packet queue.
	 * XXX - this may not be a desired behavior.  Packets should be
	 *		re-queued.
	 */
	rmc_dropall(cl);

	/*
	 * If the class has a parent, then remove the class from the
	 * parent's children chain.
	 */
	if (cl->parent_ != NULL) {
		head = cl->parent_->children_;
		p = previous = head;
		if (head->next_ == NULL) {
			ASSERT(head == cl);
			cl->parent_->children_ = NULL;
			cl->parent_->leaf_ = 1;
		} else while (p != NULL) {
			if (p == cl) {
				if (cl == head)
					cl->parent_->children_ = cl->next_;
				else
					previous->next_ = cl->next_;
				cl->next_ = NULL;
				p = NULL;
			} else {
				previous = p;
				p = p->next_;
			}
		}
	}

	/*
	 * Delete class from class priority peer list.
	 */
	if ((p = ifd->active_[cl->pri_]) != NULL) {
		/*
		 * If there is more than one member of this priority
		 * level, then look for class(cl) in the priority level.
		 */
		if (p != p->peer_) {
			while (p->peer_ != cl)
				p = p->peer_;
			p->peer_ = cl->peer_;

			if (ifd->active_[cl->pri_] == cl)
				ifd->active_[cl->pri_] = cl->peer_;
		} else {
			ASSERT(p == cl);
			ifd->active_[cl->pri_] = NULL;
		}
	}

	/*
	 * Recompute the WRR weights.
	 */
	if (ifd->wrr_) {
		ifd->alloc_[cl->pri_] -= cl->allotment_;
		ifd->num_[cl->pri_]--;
		rmc_wrr_set_weights(ifd);
	}

	/*
	 * Re-compute the depth of the tree.
	 */
#if 1 /* ALTQ */
	rmc_depth_recompute(cl->parent_);
#else
	rmc_depth_recompute(ifd->root_);
#endif

	IFQ_UNLOCK(ifd->ifq_);
	splx(s);

	/*
	 * Free the class structure.
	 */
	if (cl->red_ != NULL) {
#ifdef ALTQ_RIO
		if (q_is_rio(cl->q_))
			rio_destroy((rio_t *)cl->red_);
#endif
#ifdef ALTQ_RED
		if (q_is_red(cl->q_))
			red_destroy(cl->red_);
#endif
	}
	free(cl->q_, M_DEVBUF);
	free(cl, M_DEVBUF);
}


/*
 * void
 * rmc_init(...) - Initialize the resource management data structures
 *	associated with the output portion of interface 'ifp'.  'ifd' is
 *	where the structures will be built (for backwards compatibility, the
 *	structures aren't kept in the ifnet struct).  'nsecPerByte'
 *	gives the link speed (inverse of bandwidth) in nanoseconds/byte.
 *	'restart' is the driver-specific routine that the generic 'delay
 *	until under limit' action will call to restart output.  `maxq'
 *	is the queue size of the 'link' & 'default' classes.  'maxqueued'
 *	is the maximum number of packets that the resource management
 *	code will allow to be queued 'downstream' (this is typically 1).
 *
 *	Returns:	NONE
 */

void
rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte,
    void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
    int minidle, u_int offtime, int flags)
{
	int		i, mtu;

	/*
	 * Initialize the CBQ tracing/debug facility.
	 */
	CBQTRACEINIT();

	bzero((char *)ifd, sizeof (*ifd));
	mtu = ifq->altq_ifp->if_mtu;
	ifd->ifq_ = ifq;
	ifd->restart = restart;
	ifd->maxqueued_ = maxqueued;
	ifd->ns_per_byte_ = nsecPerByte;
	ifd->maxpkt_ = mtu;
	ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
	ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
#if 1
	ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16;
	if (mtu * nsecPerByte > 10 * 1000000)
		ifd->maxiftime_ /= 4;
#endif

	reset_cutoff(ifd);
	CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);

	/*
	 * Initialize the CBQ's WRR state.
	 */
	for (i = 0; i < RM_MAXPRIO; i++) {
		ifd->alloc_[i] = 0;
		ifd->M_[i] = 0;
		ifd->num_[i] = 0;
		ifd->na_[i] = 0;
		ifd->active_[i] = NULL;
	}

	/*
	 * Initialize current packet state.
	 */
	ifd->qi_ = 0;
	ifd->qo_ = 0;
	for (i = 0; i < RM_MAXQUEUED; i++) {
		ifd->class_[i] = NULL;
		ifd->curlen_[i] = 0;
		ifd->borrowed_[i] = NULL;
	}

	/*
	 * Create the root class of the link-sharing structure.
	 */
	if ((ifd->root_ = rmc_newclass(0, ifd,
				       nsecPerByte,
				       rmc_root_overlimit, maxq, 0, 0,
				       maxidle, minidle, offtime,
				       0, 0)) == NULL) {
		printf("rmc_init: root class not allocated\n");
		return;
	}
	ifd->root_->depth_ = 0;
}
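
/*
 * Editorial sketch of how a consumer of this module (e.g. the CBQ
 * glue code) might bring it up.  `my_restart' and the parameter
 * values are hypothetical, not part of this file.
 */
#ifdef notdef
static void
example_attach(struct ifaltq *ifq, struct rm_ifdat *ifd,
    void (*my_restart)(struct ifaltq *))
{
	u_int nsPerByte = 800;		/* 10Mb/s link */

	/* root class plus interface state; WRR, queue limit 64 */
	rmc_init(ifq, ifd, nsPerByte, my_restart, 64, 1,
	    0 /* maxidle */, 0 /* minidle */, 0 /* offtime */, RMCF_WRR);

	/* a leaf class at priority 1 that gets 50% of the link */
	(void)rmc_newclass(1, ifd, nsPerByte * 2, rmc_delay_action,
	    64, ifd->root_, ifd->root_, 20 /* maxidle */,
	    -1 /* minidle */, 10 /* offtime */, 1500 /* pktsize */, 0);
}
#endif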

/*
 * void
 * rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
 *	mbuf 'm' to queue for resource class 'cl'.  This routine is called
 *	by a driver's if_output routine.  This routine must be called with
 *	output packet completion interrupts locked out (to avoid racing with
 *	rmc_dequeue_next).
 *
 *	Returns:	0 on successful queueing
 *			-1 when packet drop occurs
 */
int
rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
{
	struct timeval	 now;
	struct rm_ifdat *ifd = cl->ifdat_;
	int		 cpri = cl->pri_;
	int		 is_empty = qempty(cl->q_);

	RM_GETTIME(now);
	if (ifd->cutoff_ > 0) {
		if (TV_LT(&cl->undertime_, &now)) {
			if (ifd->cutoff_ > cl->depth_)
				ifd->cutoff_ = cl->depth_;
			CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
		}
#if 1 /* ALTQ */
		else {
			/*
			 * the class is overlimit. if the class has
			 * underlimit ancestors, set cutoff to the lowest
			 * depth among them.
			 */
			struct rm_class *borrow = cl->borrow_;

			while (borrow != NULL &&
			       borrow->depth_ < ifd->cutoff_) {
				if (TV_LT(&borrow->undertime_, &now)) {
					ifd->cutoff_ = borrow->depth_;
					CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
					break;
				}
				borrow = borrow->borrow_;
			}
		}
#else /* !ALTQ */
		else if ((ifd->cutoff_ > 1) && cl->borrow_) {
			if (TV_LT(&cl->borrow_->undertime_, &now)) {
				ifd->cutoff_ = cl->borrow_->depth_;
				CBQTRACE(rmc_queue_packet, 'ffob',
					 cl->borrow_->depth_);
			}
		}
#endif /* !ALTQ */
	}

	if (_rmc_addq(cl, m) < 0)
		/* failed */
		return (-1);

	if (is_empty) {
		CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
		ifd->na_[cpri]++;
	}

	if (qlen(cl->q_) > qlimit(cl->q_)) {
		/* note: qlimit can be set to 0 or 1 */
		rmc_drop_action(cl);
		return (-1);
	}
	return (0);
}

/*
 * void
 * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
 *	classes to see if they are satisfied.
 */

static void
rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
{
	int		 i;
	rm_class_t	*p, *bp;

	for (i = RM_MAXPRIO - 1; i >= 0; i--) {
		if ((bp = ifd->active_[i]) != NULL) {
			p = bp;
			do {
				if (!rmc_satisfied(p, now)) {
					ifd->cutoff_ = p->depth_;
					return;
				}
				p = p->peer_;
			} while (p != bp);
		}
	}

	reset_cutoff(ifd);
}

/*
 * rmc_satisfied - Return 1 if the class is satisfied, 0 otherwise.
 */

static int
rmc_satisfied(struct rm_class *cl, struct timeval *now)
{
	rm_class_t	*p;

	if (cl == NULL)
		return (1);
	if (TV_LT(now, &cl->undertime_))
		return (1);
	if (cl->depth_ == 0) {
		if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
			return (0);
		else
			return (1);
	}
	if (cl->children_ != NULL) {
		p = cl->children_;
		while (p != NULL) {
			if (!rmc_satisfied(p, now))
				return (0);
			p = p->next_;
		}
	}

	return (1);
}

/*
 * Return 1 if class 'cl' is under limit or can borrow from a parent,
 * 0 if overlimit.  As a side-effect, this routine will invoke the
 * class overlimit action if the class is overlimit.
 */

static int
rmc_under_limit(struct rm_class *cl, struct timeval *now)
{
	rm_class_t	*p = cl;
	rm_class_t	*top;
	struct rm_ifdat	*ifd = cl->ifdat_;

	ifd->borrowed_[ifd->qi_] = NULL;
	/*
	 * If cl is the root class, then always return that it is
	 * underlimit.  Otherwise, check to see if the class is underlimit.
	 */
	if (cl->parent_ == NULL)
		return (1);

	if (cl->sleeping_) {
		if (TV_LT(now, &cl->undertime_))
			return (0);

		CALLOUT_STOP(&cl->callout_);
		cl->sleeping_ = 0;
		cl->undertime_.tv_sec = 0;
		return (1);
	}

	top = NULL;
	while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
		if (((cl = cl->borrow_) == NULL) ||
		    (cl->depth_ > ifd->cutoff_)) {
#ifdef ADJUST_CUTOFF
			if (cl != NULL)
				/* cutoff is taking effect, just
				   return false without calling
				   the delay action. */
				return (0);
#endif
#ifdef BORROW_OFFTIME
			/*
			 * check if the class can borrow offtime too.
			 * borrow offtime from the top of the borrow
			 * chain if the top class is not overloaded.
			 */
			if (cl != NULL) {
				/* cutoff is taking effect, use this class as top. */
				top = cl;
				CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
			}
			if (top != NULL && top->avgidle_ == top->minidle_)
				top = NULL;
			p->overtime_ = *now;
			(p->overlimit)(p, top);
#else
			p->overtime_ = *now;
			(p->overlimit)(p, NULL);
#endif
			return (0);
		}
		top = cl;
	}

	if (cl != p)
		ifd->borrowed_[ifd->qi_] = cl;
	return (1);
}

/*
 * _rmc_wrr_dequeue_next() - This is the scheduler for WRR as opposed to
 *	packet-by-packet round robin.
 *
 * The heart of the weighted round-robin scheduler, which decides which
 * class next gets to send a packet.  Highest priority first, then
 * weighted round-robin within priorities.
 *
 * Each able-to-send class gets to send until its byte allocation is
 * exhausted.  Thus, the active pointer is only changed after a class has
 * exhausted its allocation.
 *
 * If the scheduler finds no class that is underlimit or able to borrow,
 * then the first class found that had a nonzero queue and is allowed to
 * borrow gets to send.
 */

static mbuf_t *
_rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
{
	struct rm_class	*cl = NULL, *first = NULL;
	u_int		 deficit;
	int		 cpri;
	mbuf_t		*m;
	struct timeval	 now;

	RM_GETTIME(now);

	/*
	 * if the driver polls the top of the queue and then removes
	 * the polled packet, we must return the same packet.
	 */
	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
		cl = ifd->pollcache_;
		cpri = cl->pri_;
		if (ifd->efficient_) {
			/* check if this class is overlimit */
			if (cl->undertime_.tv_sec != 0 &&
			    rmc_under_limit(cl, &now) == 0)
				first = cl;
		}
		ifd->pollcache_ = NULL;
		goto _wrr_out;
	} else {
		/* mode == ALTDQ_POLL || pollcache == NULL */
		ifd->pollcache_ = NULL;
		ifd->borrowed_[ifd->qi_] = NULL;
	}
#ifdef ADJUST_CUTOFF
 _again:
#endif
	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
		if (ifd->na_[cpri] == 0)
			continue;
		deficit = 0;
		/*
		 * Loop through twice for a priority level, if some class
		 * was unable to send a packet the first round because
		 * of the weighted round-robin mechanism.
		 * During the second loop at this level, deficit==2.
		 * (This second loop is not needed if for every class,
		 * "M[cl->pri_]" times "cl->allotment" is greater than
		 * the byte size for the largest packet in the class.)
		 */
 _wrr_loop:
		cl = ifd->active_[cpri];
		ASSERT(cl != NULL);
		do {
			if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
				cl->bytes_alloc_ += cl->w_allotment_;
			if (!qempty(cl->q_)) {
				if ((cl->undertime_.tv_sec == 0) ||
				    rmc_under_limit(cl, &now)) {
					if (cl->bytes_alloc_ > 0 || deficit > 1)
						goto _wrr_out;

					/* underlimit but no alloc */
					deficit = 1;
#if 1
					ifd->borrowed_[ifd->qi_] = NULL;
#endif
				} else if (first == NULL && cl->borrow_ != NULL)
					first = cl; /* borrowing candidate */
			}

			cl->bytes_alloc_ = 0;
			cl = cl->peer_;
		} while (cl != ifd->active_[cpri]);

		if (deficit == 1) {
			/* first loop found an underlimit class with deficit */
			/* Loop on same priority level, with new deficit.  */
			deficit = 2;
			goto _wrr_loop;
		}
	}

#ifdef ADJUST_CUTOFF
	/*
	 * no underlimit class found.  if cutoff is taking effect,
	 * increase cutoff and try again.
	 */
	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
		ifd->cutoff_++;
		CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
		goto _again;
	}
#endif /* ADJUST_CUTOFF */
	/*
	 * If LINK_EFFICIENCY is turned on, then the first overlimit
	 * class we encounter will send a packet if all the classes
	 * of the link-sharing structure are overlimit.
	 */
	reset_cutoff(ifd);
	CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);

	if (!ifd->efficient_ || first == NULL)
		return (NULL);

	cl = first;
	cpri = cl->pri_;
#if 0	/* too time-consuming for nothing */
	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);
	cl->sleeping_ = 0;
	cl->undertime_.tv_sec = 0;
#endif
	ifd->borrowed_[ifd->qi_] = cl->borrow_;
	ifd->cutoff_ = cl->borrow_->depth_;
	/*
	 * Dequeue the packet and do the bookkeeping...
	 */
 _wrr_out:
	if (op == ALTDQ_REMOVE) {
		m = _rmc_getq(cl);
		if (m == NULL)
			panic("_rmc_wrr_dequeue_next");
		if (qempty(cl->q_))
			ifd->na_[cpri]--;

		/*
		 * Update class statistics and link data.
		 */
		if (cl->bytes_alloc_ > 0)
			cl->bytes_alloc_ -= m_pktlen(m);

		if ((cl->bytes_alloc_ <= 0) || first == cl)
			ifd->active_[cl->pri_] = cl->peer_;
		else
			ifd->active_[cl->pri_] = cl;

		ifd->class_[ifd->qi_] = cl;
		ifd->curlen_[ifd->qi_] = m_pktlen(m);
		ifd->now_[ifd->qi_] = now;
		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
		ifd->queued_++;
	} else {
		/* mode == ALTDQ_POLL */
		m = _rmc_pollq(cl);
		ifd->pollcache_ = cl;
	}
	return (m);
}

/*
 * Dequeue & return next packet from the highest priority class that
 * has a packet to send & has enough allocation to send it.  This
 * routine is called by a driver whenever it needs a new packet to
 * output.
 */
static mbuf_t *
_rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
{
	mbuf_t		*m;
	int		 cpri;
	struct rm_class	*cl, *first = NULL;
	struct timeval	 now;

	RM_GETTIME(now);

	/*
	 * if the driver polls the top of the queue and then removes
	 * the polled packet, we must return the same packet.
	 */
	if (op == ALTDQ_REMOVE && ifd->pollcache_) {
		cl = ifd->pollcache_;
		cpri = cl->pri_;
		ifd->pollcache_ = NULL;
		goto _prr_out;
	} else {
		/* mode == ALTDQ_POLL || pollcache == NULL */
		ifd->pollcache_ = NULL;
		ifd->borrowed_[ifd->qi_] = NULL;
	}
#ifdef ADJUST_CUTOFF
 _again:
#endif
	for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
		if (ifd->na_[cpri] == 0)
			continue;
		cl = ifd->active_[cpri];
		ASSERT(cl != NULL);
		do {
			if (!qempty(cl->q_)) {
				if ((cl->undertime_.tv_sec == 0) ||
				    rmc_under_limit(cl, &now))
					goto _prr_out;
				if (first == NULL && cl->borrow_ != NULL)
					first = cl;
			}
			cl = cl->peer_;
		} while (cl != ifd->active_[cpri]);
	}

#ifdef ADJUST_CUTOFF
	/*
	 * no underlimit class found.  if cutoff is taking effect, increase
	 * cutoff and try again.
	 */
	if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
		ifd->cutoff_++;
		goto _again;
	}
#endif /* ADJUST_CUTOFF */
	/*
	 * If LINK_EFFICIENCY is turned on, then the first overlimit
	 * class we encounter will send a packet if all the classes
	 * of the link-sharing structure are overlimit.
	 */
	reset_cutoff(ifd);
	if (!ifd->efficient_ || first == NULL)
		return (NULL);

	cl = first;
	cpri = cl->pri_;
#if 0	/* too time-consuming for nothing */
	if (cl->sleeping_)
		CALLOUT_STOP(&cl->callout_);
	cl->sleeping_ = 0;
	cl->undertime_.tv_sec = 0;
#endif
	ifd->borrowed_[ifd->qi_] = cl->borrow_;
	ifd->cutoff_ = cl->borrow_->depth_;
	/*
	 * Dequeue the packet and do the bookkeeping...
	 */
 _prr_out:
	if (op == ALTDQ_REMOVE) {
		m = _rmc_getq(cl);
		if (m == NULL)
			panic("_rmc_prr_dequeue_next");
		if (qempty(cl->q_))
			ifd->na_[cpri]--;

		ifd->active_[cpri] = cl->peer_;

		ifd->class_[ifd->qi_] = cl;
		ifd->curlen_[ifd->qi_] = m_pktlen(m);
		ifd->now_[ifd->qi_] = now;
		ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
		ifd->queued_++;
	} else {
		/* mode == ALTDQ_POLL */
		m = _rmc_pollq(cl);
		ifd->pollcache_ = cl;
	}
	return (m);
}

/*
 * mbuf_t *
 * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function
 *	is invoked by the packet driver to get the next packet to be
 *	dequeued and output on the link.  If WRR is enabled, then the
 *	WRR dequeue next routine will determine the next packet to be sent.
 *	Otherwise, packet-by-packet round robin is invoked.
 *
 *	Returns:	NULL, if a packet is not available or if all
 *			classes are overlimit.
 *
 *			Otherwise, a pointer to the next packet.
 */

mbuf_t *
rmc_dequeue_next(struct rm_ifdat *ifd, int mode)
{
	if (ifd->queued_ >= ifd->maxqueued_)
		return (NULL);
	else if (ifd->wrr_)
		return (_rmc_wrr_dequeue_next(ifd, mode));
	else
		return (_rmc_prr_dequeue_next(ifd, mode));
}
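
/*
 * Editorial sketch of the calling convention a driver is expected to
 * follow (see also rmc_update_class_util() below): dequeue with
 * ALTDQ_REMOVE only when the packet can really be handed to the
 * hardware.  `example_start' and `hardware_xmit' are hypothetical.
 */
#ifdef notdef
static void
example_start(struct rm_ifdat *ifd)
{
	mbuf_t *m;

	while ((m = rmc_dequeue_next(ifd, ALTDQ_REMOVE)) != NULL) {
		hardware_xmit(m);
		/* ideally called from the xmit-complete interrupt */
		rmc_update_class_util(ifd);
	}
}
#endif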

/*
 * Update the utilization estimate for the packet that just completed.
 * The packet's class & the parent(s) of that class all get their
 * estimators updated.  This routine is called by the driver's output-
 * packet-completion interrupt service routine.
 */

/*
 * a macro to approximate "divide by 1000" that gives 0.000999,
 * if a value has enough effective digits.
 * (on pentium, mul takes 9 cycles but div takes 46!)
 */
#define	NSEC_TO_USEC(t)	(((t) >> 10) + ((t) >> 16) + ((t) >> 17))
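/*
 * (2^-10 + 2^-16 + 2^-17 = 0.0009765625 + 0.0000152588 + 0.0000076294
 *  = 0.0009994507, i.e. within 0.06% of an exact divide by 1000.)
 */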
void
rmc_update_class_util(struct rm_ifdat *ifd)
{
	int		 idle, avgidle, pktlen;
	int		 pkt_time, tidle;
	rm_class_t	*cl, *borrowed;
	rm_class_t	*borrows;
	struct timeval	*nowp;

	/*
	 * Get the most recent completed class.
	 */
	if ((cl = ifd->class_[ifd->qo_]) == NULL)
		return;

	pktlen = ifd->curlen_[ifd->qo_];
	borrowed = ifd->borrowed_[ifd->qo_];
	borrows = borrowed;

	PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);

	/*
	 * Run estimator on class and its ancestors.
	 */
	/*
	 * rmc_update_class_util is designed to be called when the
	 * transfer is completed, from an xmit complete interrupt,
	 * but most drivers don't implement an upcall for that.
	 * so, just use the estimated completion time.
	 * as a result, ifd->qi_ and ifd->qo_ are always synced.
	 */
	nowp = &ifd->now_[ifd->qo_];
	/* get pkt_time (for link) in usec */
#if 1  /* use approximation */
	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_;
	pkt_time = NSEC_TO_USEC(pkt_time);
#else
	pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
#endif
#if 1 /* ALTQ4PPP */
	if (TV_LT(nowp, &ifd->ifnow_)) {
		int iftime;

		/*
		 * make sure the estimated completion time does not go
		 * too far.  it can happen when the link layer supports
		 * data compression or the interface speed is set to
		 * a much lower value.
		 */
		TV_DELTA(&ifd->ifnow_, nowp, iftime);
		if (iftime+pkt_time < ifd->maxiftime_) {
			TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
		} else {
			TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
		}
	} else {
		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
	}
#else
	if (TV_LT(nowp, &ifd->ifnow_)) {
		TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
	} else {
		TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
	}
#endif

	while (cl != NULL) {
		TV_DELTA(&ifd->ifnow_, &cl->last_, idle);
		if (idle >= 2000000)
			/*
			 * this class is idle enough, reset avgidle.
			 * (TV_DELTA returns 2000000 us when delta is large.)
			 */
			cl->avgidle_ = cl->maxidle_;

		/* get pkt_time (for class) in usec */
#if 1  /* use approximation */
		pkt_time = pktlen * cl->ns_per_byte_;
		pkt_time = NSEC_TO_USEC(pkt_time);
#else
		pkt_time = pktlen * cl->ns_per_byte_ / 1000;
#endif
		idle -= pkt_time;

		avgidle = cl->avgidle_;
		avgidle += idle - (avgidle >> RM_FILTER_GAIN);
		cl->avgidle_ = avgidle;

		/* Are we overlimit ? */
		if (avgidle <= 0) {
			CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle);
#if 1 /* ALTQ */
			/*
			 * need some lower bound for avgidle, otherwise
			 * a borrowing class gets unbounded penalty.
			 */
			if (avgidle < cl->minidle_)
				avgidle = cl->avgidle_ = cl->minidle_;
#endif
			/* set next idle to make avgidle 0 */
			tidle = pkt_time +
				(((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
			TV_ADD_DELTA(nowp, tidle, &cl->undertime_);
			++cl->stats_.over;
		} else {
			cl->avgidle_ =
			    (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
			cl->undertime_.tv_sec = 0;
			if (cl->sleeping_) {
				CALLOUT_STOP(&cl->callout_);
				cl->sleeping_ = 0;
			}
		}

		if (borrows != NULL) {
			if (borrows != cl)
				++cl->stats_.borrows;
			else
				borrows = NULL;
		}
		cl->last_ = ifd->ifnow_;
		cl->last_pkttime_ = pkt_time;

#if 1
		if (cl->parent_ == NULL) {
			/* take stats of root class */
			PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
		}
#endif

		cl = cl->parent_;
	}

	/*
	 * Check to see if cutoff needs to be set to a new level.
	 */
	cl = ifd->class_[ifd->qo_];
	if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
#if 1 /* ALTQ */
		if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) {
			rmc_tl_satisfied(ifd, nowp);
			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
		} else {
			ifd->cutoff_ = borrowed->depth_;
			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
		}
#else /* !ALTQ */
		if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) {
			reset_cutoff(ifd);
#ifdef notdef
			rmc_tl_satisfied(ifd, &now);
#endif
			CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
		} else {
			ifd->cutoff_ = borrowed->depth_;
			CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
		}
#endif /* !ALTQ */
	}

	/*
	 * Release class slot
	 */
	ifd->borrowed_[ifd->qo_] = NULL;
	ifd->class_[ifd->qo_] = NULL;
	ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
	ifd->queued_--;
}

/*
 * void
 * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
 *	over-limit action routines.  These get invoked by rmc_under_limit()
 *	if a class with packets to send is over its bandwidth limit & can't
 *	borrow from a parent class.
 *
 *	Returns: NONE
 */

static void
rmc_drop_action(struct rm_class *cl)
{
	struct rm_ifdat	*ifd = cl->ifdat_;

	ASSERT(qlen(cl->q_) > 0);
	_rmc_dropq(cl);
	if (qempty(cl->q_))
		ifd->na_[cl->pri_]--;
}

void
rmc_dropall(struct rm_class *cl)
{
	struct rm_ifdat	*ifd = cl->ifdat_;

	if (!qempty(cl->q_)) {
		_flushq(cl->q_);

		ifd->na_[cl->pri_]--;
	}
}

#if (__FreeBSD_version > 300000)
/* hzto() was removed from FreeBSD-3.0 */
static int hzto(struct timeval *);

static int
hzto(struct timeval *tv)
{
	struct timeval t2;

	getmicrotime(&t2);
	t2.tv_sec = tv->tv_sec - t2.tv_sec;
	t2.tv_usec = tv->tv_usec - t2.tv_usec;
	return (tvtohz(&t2));
}
#endif /* __FreeBSD_version > 300000 */

/*
 * void
 * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
 *	delay action routine.  It is invoked via rmc_under_limit when the
 *	packet is discovered to be overlimit.
 *
 *	If the delay action is the result of the borrow class being overlimit,
 *	then delay for the offtime of the borrow class that is overlimit.
 *
 *	Returns: NONE
 */

void
rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
{
	int	delay, t, extradelay;

	cl->stats_.overactions++;
	TV_DELTA(&cl->undertime_, &cl->overtime_, delay);
#ifndef BORROW_OFFTIME
	delay += cl->offtime_;
#endif

	if (!cl->sleeping_) {
		CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
#ifdef BORROW_OFFTIME
		if (borrow != NULL)
			extradelay = borrow->offtime_;
		else
#endif
			extradelay = cl->offtime_;

#ifdef ALTQ
		/*
		 * XXX recalculate suspend time:
		 * current undertime is (tidle + pkt_time) calculated
		 * from the last transmission.
		 *	tidle: time required to bring avgidle back to 0
		 *	pkt_time: target waiting time for this class
		 * we need to replace pkt_time by offtime
		 */
		extradelay -= cl->last_pkttime_;
#endif
		if (extradelay > 0) {
			TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
			delay += extradelay;
		}

		cl->sleeping_ = 1;
		cl->stats_.delays++;

		/*
		 * Since packets are phased randomly with respect to the
		 * clock, 1 tick (the next clock tick) can be an arbitrarily
		 * short time so we have to wait for at least two ticks.
		 * NOTE:  If there's no other traffic, we need the timer as
		 * a 'backstop' to restart this class.
		 */
		if (delay > tick * 2) {
#ifdef __FreeBSD__
			/* FreeBSD rounds up the tick */
			t = hzto(&cl->undertime_);
#else
			/* other BSDs round down the tick */
			t = hzto(&cl->undertime_) + 1;
#endif
		} else
			t = 2;
		CALLOUT_RESET(&cl->callout_, t,
			      (timeout_t *)rmc_restart, (caddr_t)cl);
	}
}

/*
 * void
 * rmc_restart() - is just a helper routine for rmc_delay_action -- it is
 *	called by the system timer code & is responsible for checking if the
 *	class is still sleeping (it might have been restarted as a side
 *	effect of the queue scan on a packet arrival) and, if so, restarting
 *	output for the class.  Inspecting the class state & restarting output
 *	require locking the class structure.  In general the driver is
 *	responsible for locking but this is the only routine that is not
 *	called directly or indirectly from the interface driver so it has to
 *	know about system locking conventions.  Under bsd, locking is done
 *	by raising IPL to splimp so that's what's implemented here.  On a
 *	different system this would probably need to be changed.
 *
 *	Returns:	NONE
 */

static void
rmc_restart(struct rm_class *cl)
{
	struct rm_ifdat	*ifd = cl->ifdat_;
	int		 s;

#ifdef __NetBSD__
	s = splnet();
#else
	s = splimp();
#endif
	IFQ_LOCK(ifd->ifq_);
	if (cl->sleeping_) {
		cl->sleeping_ = 0;
		cl->undertime_.tv_sec = 0;

		if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
			CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
			(ifd->restart)(ifd->ifq_);
		}
	}
	IFQ_UNLOCK(ifd->ifq_);
	splx(s);
}

/*
 * void
 * rmc_root_overlimit(struct rm_class *cl) - This is the generic overlimit
 *	handling routine for the root class of the link sharing structure.
 *
 *	Returns: NONE
 */

static void
rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow)
{
	panic("rmc_root_overlimit");
}

/*
 * Packet queue handling routines.  Eventually, this is to localize the
 *	effects on the code of whether queues are RED queues or drop-tail
 *	queues.
 */
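/*
 * Editorial note on the representation used below: class_queue_t keeps
 * only a tail pointer; packets form a circular list linked through
 * m_nextpkt, so qtail(q)->m_nextpkt is the head.  _addq() inserts
 * behind the tail and _getq() removes the head.
 */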

static int
_rmc_addq(rm_class_t *cl, mbuf_t *m)
{
#ifdef ALTQ_RIO
	if (q_is_rio(cl->q_))
		return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
#endif
#ifdef ALTQ_RED
	if (q_is_red(cl->q_))
		return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
#endif /* ALTQ_RED */

	if (cl->flags_ & RMCF_CLEARDSCP)
		write_dsfield(m, cl->pktattr_, 0);

	_addq(cl->q_, m);
	return (0);
}

/* note: _rmc_dropq is not called for red */
static void
_rmc_dropq(rm_class_t *cl)
{
	mbuf_t	*m;

	if ((m = _getq(cl->q_)) != NULL)
		m_freem(m);
}

static mbuf_t *
_rmc_getq(rm_class_t *cl)
{
#ifdef ALTQ_RIO
	if (q_is_rio(cl->q_))
		return rio_getq((rio_t *)cl->red_, cl->q_);
#endif
#ifdef ALTQ_RED
	if (q_is_red(cl->q_))
		return red_getq(cl->red_, cl->q_);
#endif
	return _getq(cl->q_);
}

static mbuf_t *
_rmc_pollq(rm_class_t *cl)
{
	return qhead(cl->q_);
}

#ifdef CBQ_TRACE

struct cbqtrace		 cbqtrace_buffer[NCBQTRACE+1];
struct cbqtrace		*cbqtrace_ptr = NULL;
int			 cbqtrace_count;

/*
 * DDB hook to trace cbq events:
 *  the last 1024 events are held in a circular buffer.
 *  use "call cbqtrace_dump(N)" to display 20 events from the Nth event.
 */
void cbqtrace_dump(int);
static char *rmc_funcname(void *);

static struct rmc_funcs {
	void	*func;
	char	*name;
} rmc_funcs[] =
{
	rmc_init,		"rmc_init",
	rmc_queue_packet,	"rmc_queue_packet",
	rmc_under_limit,	"rmc_under_limit",
	rmc_update_class_util,	"rmc_update_class_util",
	rmc_delay_action,	"rmc_delay_action",
	rmc_restart,		"rmc_restart",
	_rmc_wrr_dequeue_next,	"_rmc_wrr_dequeue_next",
	NULL,			NULL
};

static char *
rmc_funcname(void *func)
{
	struct rmc_funcs *fp;

	for (fp = rmc_funcs; fp->func != NULL; fp++)
		if (fp->func == func)
			return (fp->name);
	return ("unknown");
}

void
cbqtrace_dump(int counter)
{
	int	 i, *p;
	char	*cp;

	counter = counter % NCBQTRACE;
	p = (int *)&cbqtrace_buffer[counter];

	for (i=0; i<20; i++) {
		printf("[0x%x] ", *p++);
		printf("%s: ", rmc_funcname((void *)*p++));
		cp = (char *)p++;
		printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
		printf("%d\n",*p++);

		if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
			p = (int *)cbqtrace_buffer;
	}
}
#endif /* CBQ_TRACE */
#endif /* ALTQ_CBQ */

#if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || defined(ALTQ_HFSC) || defined(ALTQ_PRIQ)
#if !defined(__GNUC__) || defined(ALTQ_DEBUG)

void
_addq(class_queue_t *q, mbuf_t *m)
{
	mbuf_t	*m0;

	if ((m0 = qtail(q)) != NULL)
		m->m_nextpkt = m0->m_nextpkt;
	else
		m0 = m;
	m0->m_nextpkt = m;
	qtail(q) = m;
	qlen(q)++;
}

mbuf_t *
_getq(class_queue_t *q)
{
	mbuf_t	*m, *m0;

	if ((m = qtail(q)) == NULL)
		return (NULL);
	if ((m0 = m->m_nextpkt) != m)
		m->m_nextpkt = m0->m_nextpkt;
	else {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	}
	qlen(q)--;
	m0->m_nextpkt = NULL;
	return (m0);
}

/* drop a packet at the tail of the queue */
mbuf_t *
_getq_tail(class_queue_t *q)
{
	mbuf_t	*m, *m0, *prev;

	if ((m = m0 = qtail(q)) == NULL)
		return NULL;
	do {
		prev = m0;
		m0 = m0->m_nextpkt;
	} while (m0 != m);
	prev->m_nextpkt = m->m_nextpkt;
	if (prev == m)  {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	} else
		qtail(q) = prev;
	qlen(q)--;
	m->m_nextpkt = NULL;
	return (m);
}

/* randomly select a packet in the queue */
mbuf_t *
_getq_random(class_queue_t *q)
{
	struct mbuf	*m;
	int		 i, n;

	if ((m = qtail(q)) == NULL)
		return NULL;
	if (m->m_nextpkt == m) {
		ASSERT(qlen(q) == 1);
		qtail(q) = NULL;
	} else {
		struct mbuf *prev = NULL;

		n = arc4random() % qlen(q) + 1;
		for (i = 0; i < n; i++) {
			prev = m;
			m = m->m_nextpkt;
		}
		prev->m_nextpkt = m->m_nextpkt;
		if (m == qtail(q))
			qtail(q) = prev;
	}
	qlen(q)--;
	m->m_nextpkt = NULL;
	return (m);
}

void
_removeq(class_queue_t *q, mbuf_t *m)
{
	mbuf_t	*m0, *prev;

	m0 = qtail(q);
	do {
		prev = m0;
		m0 = m0->m_nextpkt;
	} while (m0 != m);
	prev->m_nextpkt = m->m_nextpkt;
	if (prev == m)
		qtail(q) = NULL;
	else if (qtail(q) == m)
		qtail(q) = prev;
	qlen(q)--;
}

void
_flushq(class_queue_t *q)
{
	mbuf_t *m;

	while ((m = _getq(q)) != NULL)
		m_freem(m);
	ASSERT(qlen(q) == 0);
}

#endif /* !__GNUC__ || ALTQ_DEBUG */
#endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */