1/*
2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
30 * Portions Copyright (c) 2000 Akamba Corp.
31 * All rights reserved
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 *    notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 *    notice, this list of conditions and the following disclaimer in the
40 *    documentation and/or other materials provided with the distribution.
41 *
42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
45 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * SUCH DAMAGE.
53 *
54 * $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.84 2004/08/25 09:31:30 pjd Exp $
55 */
56
57#define	DUMMYNET_DEBUG
58
59/*
60 * This module implements IP dummynet, a bandwidth limiter/delay emulator
61 * used in conjunction with the ipfw package.
62 * Description of the data structures used is in ip_dummynet.h
63 * Here you mainly find the following blocks of code:
64 *  + variable declarations;
65 *  + heap management functions;
66 *  + scheduler and dummynet functions;
67 *  + configuration and initialization.
68 *
69 * NOTA BENE: critical sections are protected by the "dummynet lock".
70 *
71 * Most important Changes:
72 *
73 * 010124: Fixed WF2Q behaviour
74 * 010122: Fixed spl protection.
75 * 000601: WF2Q support
76 * 000106: large rewrite, use heaps to handle very many pipes.
77 * 980513:	initial release
78 *
79 * include files marked with XXX are probably not needed
80 */
81
82#include <sys/param.h>
83#include <sys/systm.h>
84#include <sys/malloc.h>
85#include <sys/mbuf.h>
86#include <sys/queue.h>			/* XXX */
87#include <sys/kernel.h>
88#include <sys/socket.h>
89#include <sys/socketvar.h>
90#include <sys/time.h>
91#include <sys/sysctl.h>
92#include <net/if.h>
93#include <net/route.h>
94#include <net/kpi_protocol.h>
95#include <netinet/in.h>
96#include <netinet/in_systm.h>
97#include <netinet/in_var.h>
98#include <netinet/ip.h>
99#include <netinet/ip_fw.h>
100#include <netinet/ip_dummynet.h>
101#include <netinet/ip_var.h>
102
103#if BRIDGE
104#include <netinet/if_ether.h> /* for struct arpcom */
105#include <net/bridge.h>
106#endif
107
108/*
109 * We keep a private variable for the simulation time, but we could
110 * probably use an existing one ("softticks" in sys/kern/kern_timer.c)
111 */
112static dn_key curr_time = 0 ; /* current simulation time */
113
/*
 * This is for the timer that fires to call dummynet() - we only enable the
 * timer when there are packets to process, otherwise it is disabled.
 */
116static int timer_enabled = 0;
117
118static int dn_hash_size = 64 ;	/* default hash size */
119
120/* statistics on number of queue searches and search steps */
121static int searches, search_steps ;
122static int pipe_expire = 1 ;   /* expire queue if empty */
123static int dn_max_ratio = 16 ; /* max queues/buckets ratio */
124
125static int red_lookup_depth = 256;	/* RED - default lookup table depth */
126static int red_avg_pkt_size = 512;      /* RED - default medium packet size */
127static int red_max_pkt_size = 1500;     /* RED - default max packet size */
128
129/*
130 * Three heaps contain queues and pipes that the scheduler handles:
131 *
132 * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
133 *
134 * wfq_ready_heap contains the pipes associated with WF2Q flows
135 *
136 * extract_heap contains pipes associated with delay lines.
137 *
138 */
139static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;
140
141static int heap_init(struct dn_heap *h, int size) ;
142static int heap_insert (struct dn_heap *h, dn_key key1, void *p);
143static void heap_extract(struct dn_heap *h, void *obj);
144
145static void transmit_event(struct dn_pipe *pipe);
146static void ready_event(struct dn_flow_queue *q);
147
148static struct dn_pipe *all_pipes = NULL ;	/* list of all pipes */
149static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */
150
151#ifdef SYSCTL_NODE
152SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
153		CTLFLAG_RW, 0, "Dummynet");
154SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
155	    CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
156SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time,
157	    CTLFLAG_RD, &curr_time, 0, "Current tick");
158SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
159	    CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
160SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
161	    CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
162SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
163	    CTLFLAG_RD, &searches, 0, "Number of queue searches");
164SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
165	    CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
166SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
167	    CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
168SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
169	    CTLFLAG_RW, &dn_max_ratio, 0,
170	"Max ratio between dynamic queues and buckets");
171SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
172	CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
173SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
174	CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
175SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
176	CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
177#endif
178
179#ifdef DUMMYNET_DEBUG
180int	dummynet_debug = 0;
181#ifdef SYSCTL_NODE
182SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
183	    0, "control debugging printfs");
184#endif
185#define	DPRINTF(X)	if (dummynet_debug) printf X
186#else
187#define	DPRINTF(X)
188#endif
189
/* Contrary to what the comment above random() claims, it does not actually
 * return a value in [0, 2^31 - 1], which breaks plr among other things.
 * Masking the result keeps it in that range and should keep working even if
 * the behavior of the function is fixed.
 */
195#define MY_RANDOM (random() & 0x7FFFFFFF)
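/*
 * Illustrative note (an assumption about how plr is scaled by the config
 * program, see the random-drop test in dummynet_io() below): MY_RANDOM is
 * uniform in [0, 2^31 - 1], so a comparison such as
 *
 *	MY_RANDOM < fs->plr
 *
 * drops with probability plr / 2^31; e.g. plr == 0x7fffffff / 100 would give
 * roughly a 1% packet loss rate.
 */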
196
197/* dummynet lock */
198lck_grp_t         *dn_mutex_grp;
199lck_grp_attr_t    *dn_mutex_grp_attr;
200lck_attr_t        *dn_mutex_attr;
201lck_mtx_t         *dn_mutex;
202
203static int config_pipe(struct dn_pipe *p);
204static int ip_dn_ctl(struct sockopt *sopt);
205
206static void dummynet(void *);
207static void dummynet_flush(void);
208void dummynet_drain(void);
209static ip_dn_io_t dummynet_io;
210static void dn_rule_delete(void *);
211
212int if_tx_rdy(struct ifnet *ifp);
213
214/*
215 * Heap management functions.
216 *
217 * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
218 * Some macros help finding parent/children so we can optimize them.
219 *
220 * heap_init() is called to expand the heap when needed.
221 * Increment size in blocks of 16 entries.
222 * XXX failure to allocate a new element is a pretty bad failure
223 * as we basically stall a whole queue forever!!
224 * Returns 1 on error, 0 on success
225 */
226#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
227#define HEAP_LEFT(x) ( 2*(x) + 1 )
228#define HEAP_IS_LEFT(x) ( (x) & 1 )
229#define HEAP_RIGHT(x) ( 2*(x) + 2 )
230#define	HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
231#define HEAP_INCREMENT	15
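/*
 * Illustrative example (indexes chosen for clarity): the node stored at
 * index 4 has HEAP_FATHER(4) == 1 and children HEAP_LEFT(4) == 9 and
 * HEAP_RIGHT(4) == 10; HEAP_IS_LEFT(9) is true because, with this layout,
 * left children always land on odd indexes.
 */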
232
233static int
234heap_init(struct dn_heap *h, int new_size)
235{
236    struct dn_heap_entry *p;
237
238    if (h->size >= new_size ) {
239	printf("dummynet: heap_init, Bogus call, have %d want %d\n",
240		h->size, new_size);
241	return 0 ;
242    }
243    new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
244    p = _MALLOC(new_size * sizeof(*p), M_DUMMYNET, M_DONTWAIT );
245    if (p == NULL) {
246	printf("dummynet: heap_init, resize %d failed\n", new_size );
247	return 1 ; /* error */
248    }
249    if (h->size > 0) {
250	bcopy(h->p, p, h->size * sizeof(*p) );
251	FREE(h->p, M_DUMMYNET);
252    }
253    h->p = p ;
254    h->size = new_size ;
255    return 0 ;
256}
257
258/*
259 * Insert element in heap. Normally, p != NULL, we insert p in
260 * a new position and bubble up. If p == NULL, then the element is
261 * already in place, and key is the position where to start the
262 * bubble-up.
263 * Returns 1 on failure (cannot allocate new heap entry)
264 *
265 * If offset > 0 the position (index, int) of the element in the heap is
266 * also stored in the element itself at the given offset in bytes.
267 */
268#define SET_OFFSET(heap, node) \
269    if (heap->offset > 0) \
270	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
271/*
272 * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
273 */
274#define RESET_OFFSET(heap, node) \
275    if (heap->offset > 0) \
276	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
277static int
278heap_insert(struct dn_heap *h, dn_key key1, void *p)
279{
280    int son = h->elements ;
281
282    if (p == NULL)	/* data already there, set starting point */
283	son = key1 ;
284    else {		/* insert new element at the end, possibly resize */
285	son = h->elements ;
286	if (son == h->size) /* need resize... */
287	    if (heap_init(h, h->elements+1) )
288		return 1 ; /* failure... */
289	h->p[son].object = p ;
290	h->p[son].key = key1 ;
291	h->elements++ ;
292    }
293    while (son > 0) {				/* bubble up */
294	int father = HEAP_FATHER(son) ;
295	struct dn_heap_entry tmp  ;
296
297	if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
298	    break ; /* found right position */
299	/* son smaller than father, swap and repeat */
300	HEAP_SWAP(h->p[son], h->p[father], tmp) ;
301	SET_OFFSET(h, son);
302	son = father ;
303    }
304    SET_OFFSET(h, son);
305    return 0 ;
306}
307
308/*
309 * remove top element from heap, or obj if obj != NULL
310 */
311static void
312heap_extract(struct dn_heap *h, void *obj)
313{
314    int child, father, maxelt = h->elements - 1 ;
315
316    if (maxelt < 0) {
317	printf("dummynet: warning, extract from empty heap 0x%p\n", h);
318	return ;
319    }
320    father = 0 ; /* default: move up smallest child */
321    if (obj != NULL) { /* extract specific element, index is at offset */
322	if (h->offset <= 0)
323	    panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
324	father = *((int *)((char *)obj + h->offset)) ;
325	if (father < 0 || father >= h->elements) {
326	    printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
327		father, h->elements);
328	    panic("dummynet: heap_extract");
329	}
330    }
331    RESET_OFFSET(h, father);
332    child = HEAP_LEFT(father) ;		/* left child */
333    while (child <= maxelt) {		/* valid entry */
334	if (child != maxelt && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
335	    child = child+1 ;		/* take right child, otherwise left */
336	h->p[father] = h->p[child] ;
337	SET_OFFSET(h, father);
338	father = child ;
339	child = HEAP_LEFT(child) ;   /* left child for next loop */
340    }
341    h->elements-- ;
342    if (father != maxelt) {
343	/*
344	 * Fill hole with last entry and bubble up, reusing the insert code
345	 */
346	h->p[father] = h->p[maxelt] ;
347	heap_insert(h, father, NULL); /* this one cannot fail */
348    }
349}
350
351#if 0
352/*
353 * change object position and update references
354 * XXX this one is never used!
355 */
356static void
357heap_move(struct dn_heap *h, dn_key new_key, void *object)
358{
359    int temp;
360    int i ;
361    int maxelt = h->elements-1 ;
362    struct dn_heap_entry buf ;
363
364    if (h->offset <= 0)
365	panic("cannot move items on this heap");
366
367    i = *((int *)((char *)object + h->offset));
368    if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
369	h->p[i].key = new_key ;
370	for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
371		 i = temp ) { /* bubble up */
372	    HEAP_SWAP(h->p[i], h->p[temp], buf) ;
373	    SET_OFFSET(h, i);
374	}
375    } else {		/* must move down */
376	h->p[i].key = new_key ;
377	while ( (temp = HEAP_LEFT(i)) <= maxelt ) { /* found left child */
378	    if ((temp != maxelt) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
379		temp++ ; /* select child with min key */
380	    if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
381		HEAP_SWAP(h->p[i], h->p[temp], buf) ;
382		SET_OFFSET(h, i);
383	    } else
384		break ;
385	    i = temp ;
386	}
387    }
388    SET_OFFSET(h, i);
389}
390#endif /* heap_move, unused */
391
392/*
393 * heapify() will reorganize data inside an array to maintain the
394 * heap property. It is needed when we delete a bunch of entries.
395 */
396static void
397heapify(struct dn_heap *h)
398{
399    int i ;
400
401    for (i = 0 ; i < h->elements ; i++ )
402	heap_insert(h, i , NULL) ;
403}
404
405/*
 * clean up the heap and free its data structure
407 */
408static void
409heap_free(struct dn_heap *h)
410{
411    if (h->size >0 )
412	FREE(h->p, M_DUMMYNET);
413    bzero(h, sizeof(*h));
414}
415
416/*
417 * --- end of heap management functions ---
418 */
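
/*
 * Illustrative sketch, never compiled: how the heap primitives above fit
 * together.  "example_heap_walk" is a hypothetical helper, not part of
 * dummynet; it only assumes the kernel printf/bzero already used in this
 * file.
 */
#if 0
static void
example_heap_walk(void)
{
    struct dn_heap h;
    dn_key keys[3] = { 30, 10, 20 };
    int i;

    bzero(&h, sizeof(h));	/* size == 0, offset == 0: no back-pointers */
    for (i = 0; i < 3; i++)
	if (heap_insert(&h, keys[i], (void *)&keys[i]))
	    return;		/* allocation failed, nothing else to undo */

    /* the smallest key is always at p[0]; extraction yields 10, 20, 30 */
    while (h.elements > 0) {
	printf("dummynet example: key %llu\n", h.p[0].key);
	heap_extract(&h, NULL);
    }
    heap_free(&h);
}
#endif /* example_heap_walk, illustrative only */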
419
420/*
421 * Return the mbuf tag holding the dummynet state.  As an optimization
422 * this is assumed to be the first tag on the list.  If this turns out
423 * wrong we'll need to search the list.
424 */
425static struct dn_pkt_tag *
426dn_tag_get(struct mbuf *m)
427{
428    struct m_tag *mtag = m_tag_first(m);
429/*	KASSERT(mtag != NULL &&
430	    mtag->m_tag_id == KERNEL_MODULE_TAG_ID &&
431	    mtag->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET,
432	    ("packet on dummynet queue w/o dummynet tag!"));
433*/
434    return (struct dn_pkt_tag *)(mtag+1);
435}
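
/*
 * Illustrative note (layout as used by m_tag_alloc() in dummynet_io()
 * below): the dn_pkt_tag is allocated right behind the m_tag header, so
 * (mtag+1) points at the dummynet state:
 *
 *	[ struct m_tag | struct dn_pkt_tag ]
 *	  ^ mtag         ^ (mtag+1)
 */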
436
437/*
438 * Scheduler functions:
439 *
440 * transmit_event() is called when the delay-line needs to enter
441 * the scheduler, either because of existing pkts getting ready,
442 * or new packets entering the queue. The event handled is the delivery
443 * time of the packet.
444 *
445 * ready_event() does something similar with fixed-rate queues, and the
446 * event handled is the finish time of the head pkt.
447 *
448 * wfq_ready_event() does something similar with WF2Q queues, and the
449 * event handled is the start time of the head pkt.
450 *
451 * In all cases, we make sure that the data structures are consistent
452 * before passing pkts out, because this might trigger recursive
453 * invocations of the procedures.
454 */
455static void
456transmit_event(struct dn_pipe *pipe)
457{
458    struct mbuf *m ;
459    struct dn_pkt_tag *pkt ;
460
461	lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
462
463    while ( (m = pipe->head) ) {
464		pkt = dn_tag_get(m);
465		if ( !DN_KEY_LEQ(pkt->output_time, curr_time) )
466			break;
467		/*
468		 * first unlink, then call procedures, since ip_input() can invoke
		 * ip_output() and vice versa, thus causing nested calls
470		 */
471		pipe->head = m->m_nextpkt ;
472		m->m_nextpkt = NULL;
473
474		/* XXX: drop the lock for now to avoid LOR's */
475		lck_mtx_unlock(dn_mutex);
476		switch (pkt->dn_dir) {
477			case DN_TO_IP_OUT: {
478				struct route tmp_rt = pkt->ro;
479				(void)ip_output(m, NULL, NULL, pkt->flags, NULL, NULL);
480				if (tmp_rt.ro_rt) {
481					rtfree(tmp_rt.ro_rt);
482					tmp_rt.ro_rt = NULL;
483				}
484				break ;
485			}
486			case DN_TO_IP_IN :
487				proto_inject(PF_INET, m);
488				break ;
489
490#if BRIDGE
491			case DN_TO_BDG_FWD :
492				/*
493				 * The bridge requires/assumes the Ethernet header is
				 * contiguous in the first mbuf header.  Ensure this is true.
495				 */
496				if (BDG_LOADED) {
497				if (m->m_len < ETHER_HDR_LEN &&
498					(m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
499					printf("dummynet/bridge: pullup fail, dropping pkt\n");
500					break;
501				}
502				m = bdg_forward_ptr(m, pkt->ifp);
503				} else {
504				/* somebody unloaded the bridge module. Drop pkt */
505				/* XXX rate limit */
506				printf("dummynet: dropping bridged packet trapped in pipe\n");
507				}
508				if (m)
509				m_freem(m);
510				break;
511#endif
512			default:
513				printf("dummynet: bad switch %d!\n", pkt->dn_dir);
514				m_freem(m);
515				break ;
516		}
517		lck_mtx_lock(dn_mutex);
518    }
519    /* if there are leftover packets, put into the heap for next event */
520    if ( (m = pipe->head) ) {
521		pkt = dn_tag_get(m);
522		/* XXX should check errors on heap_insert, by draining the
523		 * whole pipe p and hoping in the future we are more successful
524		 */
525		heap_insert(&extract_heap, pkt->output_time, pipe);
526    }
527}
528
529/*
530 * the following macro computes how many ticks we have to wait
531 * before being able to transmit a packet. The credit is taken from
532 * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
533 */
534
535/* hz is 100, which gives a granularity of 10ms in the old timer.
536 * The timer has been changed to fire every 1ms, so the use of
537 * hz has been modified here. All instances of hz have been left
538 * in place but adjusted by a factor of 10 so that hz is functionally
539 * equal to 1000.
540 */
541#define SET_TICKS(_m, q, p)	\
542    ((_m)->m_pkthdr.len*8*(hz*10) - (q)->numbytes + p->bandwidth - 1 ) / \
543	    p->bandwidth ;
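/*
 * Illustrative example (numbers chosen for clarity): with the adjusted
 * clock (hz*10 == 1000 ticks per second) a 1500-byte packet on a pipe
 * configured at 1 Mbit/s needs 1500*8*1000 = 12,000,000 "scaled bits";
 * with q->numbytes == 0 the macro yields 12, i.e. the packet completes
 * 12 ticks (12 ms) after reaching the head of the queue.
 */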
544
545/*
546 * extract pkt from queue, compute output time (could be now)
547 * and put into delay line (p_queue)
548 */
549static void
550move_pkt(struct mbuf *pkt, struct dn_flow_queue *q,
551	struct dn_pipe *p, int len)
552{
553    struct dn_pkt_tag *dt = dn_tag_get(pkt);
554
555    q->head = pkt->m_nextpkt ;
556    q->len-- ;
557    q->len_bytes -= len ;
558
559    dt->output_time = curr_time + p->delay ;
560
561    if (p->head == NULL)
562	p->head = pkt;
563    else
564	p->tail->m_nextpkt = pkt;
565    p->tail = pkt;
566    p->tail->m_nextpkt = NULL;
567}
568
569/*
570 * ready_event() is invoked every time the queue must enter the
571 * scheduler, either because the first packet arrives, or because
572 * a previously scheduled event fired.
 * On invocation, drain as many pkts as possible (could be 0) and then,
 * if there are leftover packets, reinsert the queue in the scheduler.
575 */
576static void
577ready_event(struct dn_flow_queue *q)
578{
579    struct mbuf *pkt;
580    struct dn_pipe *p = q->fs->pipe ;
581    int p_was_empty ;
582
583	lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
584
585    if (p == NULL) {
586	printf("dummynet: ready_event- pipe is gone\n");
587	return ;
588    }
589    p_was_empty = (p->head == NULL) ;
590
591    /*
592     * schedule fixed-rate queues linked to this pipe:
593     * Account for the bw accumulated since last scheduling, then
594     * drain as many pkts as allowed by q->numbytes and move to
595     * the delay line (in p) computing output time.
596     * bandwidth==0 (no limit) means we can drain the whole queue,
597     * setting len_scaled = 0 does the job.
598     */
599    q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth;
600    while ( (pkt = q->head) != NULL ) {
601	int len = pkt->m_pkthdr.len;
602	int len_scaled = p->bandwidth ? len*8*(hz*10) : 0 ;
603	if (len_scaled > q->numbytes )
604	    break ;
605	q->numbytes -= len_scaled ;
606	move_pkt(pkt, q, p, len);
607    }
608    /*
609     * If we have more packets queued, schedule next ready event
610     * (can only occur when bandwidth != 0, otherwise we would have
611     * flushed the whole queue in the previous loop).
612     * To this purpose we record the current time and compute how many
613     * ticks to go for the finish time of the packet.
614     */
615    if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */
616	dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */
617	q->sched_time = curr_time ;
618	heap_insert(&ready_heap, curr_time + t, (void *)q );
619	/* XXX should check errors on heap_insert, and drain the whole
620	 * queue on error hoping next time we are luckier.
621	 */
622    } else {	/* RED needs to know when the queue becomes empty */
623	q->q_time = curr_time;
624	q->numbytes = 0;
625    }
626    /*
627     * If the delay line was empty call transmit_event(p) now.
628     * Otherwise, the scheduler will take care of it.
629     */
630    if (p_was_empty)
631	transmit_event(p);
632}
633
634/*
635 * Called when we can transmit packets on WF2Q queues. Take pkts out of
636 * the queues at their start time, and enqueue into the delay line.
637 * Packets are drained until p->numbytes < 0. As long as
638 * len_scaled >= p->numbytes, the packet goes into the delay line
639 * with a deadline p->delay. For the last packet, if p->numbytes<0,
640 * there is an additional delay.
641 */
642static void
643ready_event_wfq(struct dn_pipe *p)
644{
645    int p_was_empty = (p->head == NULL) ;
646    struct dn_heap *sch = &(p->scheduler_heap);
647    struct dn_heap *neh = &(p->not_eligible_heap) ;
648
649	lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
650
651    if (p->if_name[0] == 0) /* tx clock is simulated */
652	p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth;
653    else { /* tx clock is for real, the ifq must be empty or this is a NOP */
654	if (p->ifp && p->ifp->if_snd.ifq_head != NULL)
655	    return ;
656	else {
657	    DPRINTF(("dummynet: pipe %d ready from %s --\n",
658		p->pipe_nr, p->if_name));
659	}
660    }
661
662    /*
663     * While we have backlogged traffic AND credit, we need to do
664     * something on the queue.
665     */
666    while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) {
667	if (sch->elements > 0) { /* have some eligible pkts to send out */
668	    struct dn_flow_queue *q = sch->p[0].object ;
669	    struct mbuf *pkt = q->head;
670	    struct dn_flow_set *fs = q->fs;
671	    u_int64_t len = pkt->m_pkthdr.len;
672	    int len_scaled = p->bandwidth ? len*8*(hz*10) : 0 ;
673
674	    heap_extract(sch, NULL); /* remove queue from heap */
675	    p->numbytes -= len_scaled ;
676	    move_pkt(pkt, q, p, len);
677
678	    p->V += (len<<MY_M) / p->sum ; /* update V */
679	    q->S = q->F ; /* update start time */
680	    if (q->len == 0) { /* Flow not backlogged any more */
681		fs->backlogged-- ;
682		heap_insert(&(p->idle_heap), q->F, q);
683	    } else { /* still backlogged */
684		/*
685		 * update F and position in backlogged queue, then
686		 * put flow in not_eligible_heap (we will fix this later).
687		 */
688		len = (q->head)->m_pkthdr.len;
689		q->F += (len<<MY_M)/(u_int64_t) fs->weight ;
690		if (DN_KEY_LEQ(q->S, p->V))
691		    heap_insert(neh, q->S, q);
692		else
693		    heap_insert(sch, q->F, q);
694	    }
695	}
696	/*
697	 * now compute V = max(V, min(S_i)). Remember that all elements in sch
698	 * have by definition S_i <= V so if sch is not empty, V is surely
699	 * the max and we must not update it. Conversely, if sch is empty
700	 * we only need to look at neh.
701	 */
702	if (sch->elements == 0 && neh->elements > 0)
703	    p->V = MAX64 ( p->V, neh->p[0].key );
704	/* move from neh to sch any packets that have become eligible */
705	while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) {
706	    struct dn_flow_queue *q = neh->p[0].object ;
707	    heap_extract(neh, NULL);
708	    heap_insert(sch, q->F, q);
709	}
710
711	if (p->if_name[0] != '\0') {/* tx clock is from a real thing */
712	    p->numbytes = -1 ; /* mark not ready for I/O */
713	    break ;
714	}
715    }
716    if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0
717	    && p->idle_heap.elements > 0) {
718	/*
719	 * no traffic and no events scheduled. We can get rid of idle-heap.
720	 */
721	int i ;
722
723	for (i = 0 ; i < p->idle_heap.elements ; i++) {
724	    struct dn_flow_queue *q = p->idle_heap.p[i].object ;
725
726	    q->F = 0 ;
727	    q->S = q->F + 1 ;
728	}
729	p->sum = 0 ;
730	p->V = 0 ;
731	p->idle_heap.elements = 0 ;
732    }
733    /*
734     * If we are getting clocks from dummynet (not a real interface) and
735     * If we are under credit, schedule the next ready event.
736     * Also fix the delivery time of the last packet.
737     */
738    if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */
739	dn_key t=0 ; /* number of ticks i have to wait */
740
741	if (p->bandwidth > 0)
742	    t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ;
743	dn_tag_get(p->tail)->output_time += t ;
744	p->sched_time = curr_time ;
745	heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
746	/* XXX should check errors on heap_insert, and drain the whole
747	 * queue on error hoping next time we are luckier.
748	 */
749    }
750    /*
751     * If the delay line was empty call transmit_event(p) now.
752     * Otherwise, the scheduler will take care of it.
753     */
754    if (p_was_empty)
755	transmit_event(p);
756}
757
758/*
 * This is called once every millisecond. It updates the current simulation
 * time and dispatches the events whose deadline has expired.
761 */
762static void
763dummynet(__unused void * unused)
764{
765    void *p ; /* generic parameter to handler */
766    struct dn_heap *h ;
767    struct dn_heap *heaps[3];
768    int i;
769    struct dn_pipe *pe ;
770    struct timespec ts;
771    struct timeval	tv;
772
773    heaps[0] = &ready_heap ;		/* fixed-rate queues */
774    heaps[1] = &wfq_ready_heap ;	/* wfq queues */
775    heaps[2] = &extract_heap ;		/* delay line */
776
777	lck_mtx_lock(dn_mutex);
778
779        /* make all time measurements in milliseconds (ms) -
780         * here we convert secs and usecs to msecs (just divide the
781	 * usecs and take the closest whole number).
782         */
783        microuptime(&tv);
784        curr_time = (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
785
786    for (i=0; i < 3 ; i++) {
787	h = heaps[i];
788	while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) {
789		if (h->p[0].key > curr_time)
790		printf("dummynet: warning, heap %d is %d ticks late\n",
791			i, (int)(curr_time - h->p[0].key));
792		p = h->p[0].object ; /* store a copy before heap_extract */
793		heap_extract(h, NULL); /* need to extract before processing */
794		if (i == 0)
795		ready_event(p) ;
796		else if (i == 1) {
797		struct dn_pipe *pipe = p;
798		if (pipe->if_name[0] != '\0')
799			printf("dummynet: bad ready_event_wfq for pipe %s\n",
800			pipe->if_name);
801		else
802			ready_event_wfq(p) ;
803		} else
804		transmit_event(p);
805	}
806    }
807    /* sweep pipes trying to expire idle flow_queues */
808    for (pe = all_pipes; pe ; pe = pe->next )
809	if (pe->idle_heap.elements > 0 &&
810		DN_KEY_LT(pe->idle_heap.p[0].key, pe->V) ) {
811	    struct dn_flow_queue *q = pe->idle_heap.p[0].object ;
812
813	    heap_extract(&(pe->idle_heap), NULL);
814	    q->S = q->F + 1 ; /* mark timestamp as invalid */
815	    pe->sum -= q->fs->weight ;
816	}
817
818	/* check the heaps to see if there's still stuff in there, and
819	 * only set the timer if there are packets to process
820	 */
821	timer_enabled = 0;
822	for (i=0; i < 3 ; i++) {
823		h = heaps[i];
824		if (h->elements > 0) { // set the timer
825			ts.tv_sec = 0;
826			ts.tv_nsec = 1 * 1000000;	// 1ms
827			timer_enabled = 1;
828			bsd_timeout(dummynet, NULL, &ts);
829			break;
830		}
831	}
832
833    lck_mtx_unlock(dn_mutex);
834}
835
836/*
837 * called by an interface when tx_rdy occurs.
838 */
839int
840if_tx_rdy(struct ifnet *ifp)
841{
842    struct dn_pipe *p;
843
844	lck_mtx_lock(dn_mutex);
845    for (p = all_pipes; p ; p = p->next )
846	if (p->ifp == ifp)
847	    break ;
848    if (p == NULL) {
849	char buf[32];
850	snprintf(buf, sizeof(buf), "%s%d",ifp->if_name, ifp->if_unit);
851	for (p = all_pipes; p ; p = p->next )
852	    if (!strcmp(p->if_name, buf) ) {
853		p->ifp = ifp ;
854		DPRINTF(("dummynet: ++ tx rdy from %s (now found)\n", buf));
855		break ;
856	    }
857    }
858    if (p != NULL) {
859	DPRINTF(("dummynet: ++ tx rdy from %s%d - qlen %d\n", ifp->if_name,
860		ifp->if_unit, ifp->if_snd.ifq_len));
861	p->numbytes = 0 ; /* mark ready for I/O */
862	ready_event_wfq(p);
863    }
	lck_mtx_unlock(dn_mutex);
865
866    return 0;
867}
868
869/*
870 * Unconditionally expire empty queues in case of shortage.
871 * Returns the number of queues freed.
872 */
873static int
874expire_queues(struct dn_flow_set *fs)
875{
876    struct dn_flow_queue *q, *prev ;
877    int i, initial_elements = fs->rq_elements ;
878	struct timeval timenow;
879
880	getmicrotime(&timenow);
881
882    if (fs->last_expired == timenow.tv_sec)
883	return 0 ;
884    fs->last_expired = timenow.tv_sec ;
885    for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
886	for (prev=NULL, q = fs->rq[i] ; q != NULL ; )
887	    if (q->head != NULL || q->S != q->F+1) {
888  		prev = q ;
889  	        q = q->next ;
890  	    } else { /* entry is idle, expire it */
891		struct dn_flow_queue *old_q = q ;
892
893		if (prev != NULL)
894		    prev->next = q = q->next ;
895		else
896		    fs->rq[i] = q = q->next ;
897		fs->rq_elements-- ;
898		FREE(old_q, M_DUMMYNET);
899	    }
900    return initial_elements - fs->rq_elements ;
901}
902
903/*
904 * If room, create a new queue and put at head of slot i;
905 * otherwise, create or use the default queue.
906 */
907static struct dn_flow_queue *
908create_queue(struct dn_flow_set *fs, int i)
909{
910    struct dn_flow_queue *q ;
911
912    if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
913	    expire_queues(fs) == 0) {
914	/*
915	 * No way to get room, use or create overflow queue.
916	 */
917	i = fs->rq_size ;
918	if ( fs->rq[i] != NULL )
919	    return fs->rq[i] ;
920    }
921    q = _MALLOC(sizeof(*q), M_DUMMYNET, M_DONTWAIT | M_ZERO);
922    if (q == NULL) {
923	printf("dummynet: sorry, cannot allocate queue for new flow\n");
924	return NULL ;
925    }
926    q->fs = fs ;
927    q->hash_slot = i ;
928    q->next = fs->rq[i] ;
929    q->S = q->F + 1;   /* hack - mark timestamp as invalid */
930    fs->rq[i] = q ;
931    fs->rq_elements++ ;
932    return q ;
933}
934
935/*
 * Given a flow_set and a flow id, find a matching queue
937 * after appropriate masking. The queue is moved to front
938 * so that further searches take less time.
939 */
940static struct dn_flow_queue *
941find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
942{
943    int i = 0 ; /* we need i and q for new allocations */
944    struct dn_flow_queue *q, *prev;
945
946    if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
947	q = fs->rq[0] ;
948    else {
949	/* first, do the masking */
950	id->dst_ip &= fs->flow_mask.dst_ip ;
951	id->src_ip &= fs->flow_mask.src_ip ;
952	id->dst_port &= fs->flow_mask.dst_port ;
953	id->src_port &= fs->flow_mask.src_port ;
954	id->proto &= fs->flow_mask.proto ;
955	id->flags = 0 ; /* we don't care about this one */
956	/* then, hash function */
957	i = ( (id->dst_ip) & 0xffff ) ^
958	    ( (id->dst_ip >> 15) & 0xffff ) ^
959	    ( (id->src_ip << 1) & 0xffff ) ^
960	    ( (id->src_ip >> 16 ) & 0xffff ) ^
961	    (id->dst_port << 1) ^ (id->src_port) ^
962	    (id->proto );
963	i = i % fs->rq_size ;
964	/* finally, scan the current list for a match */
965	searches++ ;
966	for (prev=NULL, q = fs->rq[i] ; q ; ) {
967	    search_steps++;
968	    if (id->dst_ip == q->id.dst_ip &&
969		    id->src_ip == q->id.src_ip &&
970		    id->dst_port == q->id.dst_port &&
971		    id->src_port == q->id.src_port &&
972		    id->proto == q->id.proto &&
973		    id->flags == q->id.flags)
974		break ; /* found */
975	    else if (pipe_expire && q->head == NULL && q->S == q->F+1 ) {
976		/* entry is idle and not in any heap, expire it */
977		struct dn_flow_queue *old_q = q ;
978
979		if (prev != NULL)
980		    prev->next = q = q->next ;
981		else
982		    fs->rq[i] = q = q->next ;
983		fs->rq_elements-- ;
984		FREE(old_q, M_DUMMYNET);
985		continue ;
986	    }
987	    prev = q ;
988	    q = q->next ;
989	}
990	if (q && prev != NULL) { /* found and not in front */
991	    prev->next = q->next ;
992	    q->next = fs->rq[i] ;
993	    fs->rq[i] = q ;
994	}
995    }
996    if (q == NULL) { /* no match, need to allocate a new entry */
997	q = create_queue(fs, i);
998	if (q != NULL)
999	q->id = *id ;
1000    }
1001    return q ;
1002}
1003
1004static int
1005red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
1006{
1007    /*
1008     * RED algorithm
1009     *
1010     * RED calculates the average queue size (avg) using a low-pass filter
1011     * with an exponential weighted (w_q) moving average:
1012     * 	avg  <-  (1-w_q) * avg + w_q * q_size
     * where q_size is the queue length (measured in bytes or packets).
1014     *
1015     * If q_size == 0, we compute the idle time for the link, and set
1016     *	avg = (1 - w_q)^(idle/s)
1017     * where s is the time needed for transmitting a medium-sized packet.
1018     *
1019     * Now, if avg < min_th the packet is enqueued.
1020     * If avg > max_th the packet is dropped. Otherwise, the packet is
1021     * dropped with probability P function of avg.
1022     *
1023     */
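    /*
     * Illustrative numbers (chosen for clarity, not from the original
     * sources): with min_th = 5, max_th = 15 (packets) and max_p = 0.1,
     * c_1 = 0.1/10 = 0.01 and c_2 = 0.1*5/10 = 0.05, so an average queue
     * of 10 packets gives p_b = 0.01*10 - 0.05 = 0.05, i.e. roughly one
     * packet in twenty is dropped at that load (the q->count correction
     * below raises the probability the longer we go without a drop).
     */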
1024
1025    int64_t p_b = 0;
1026    /* queue in bytes or packets ? */
1027    u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len;
1028
1029    DPRINTF(("\ndummynet: %d q: %2u ", (int) curr_time, q_size));
1030
1031    /* average queue size estimation */
1032    if (q_size != 0) {
1033	/*
1034	 * queue is not empty, avg <- avg + (q_size - avg) * w_q
1035	 */
1036	int diff = SCALE(q_size) - q->avg;
1037	int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q);
1038
1039	q->avg += (int) v;
1040    } else {
1041	/*
	 * queue is empty, find out for how long the queue has been
	 * empty and use a lookup table for computing
	 * (1 - w_q)^(idle_time/s) where s is the time to send a
1045	 * (small) packet.
1046	 * XXX check wraps...
1047	 */
1048	if (q->avg) {
1049	    u_int t = (curr_time - q->q_time) / fs->lookup_step;
1050
1051	    q->avg = (t < fs->lookup_depth) ?
1052		    SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
1053	}
1054    }
1055    DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
1056
1057    /* should i drop ? */
1058
1059    if (q->avg < fs->min_th) {
1060	q->count = -1;
1061	return 0; /* accept packet ; */
1062    }
1063    if (q->avg >= fs->max_th) { /* average queue >=  max threshold */
1064	if (fs->flags_fs & DN_IS_GENTLE_RED) {
1065	    /*
1066	     * According to Gentle-RED, if avg is greater than max_th the
1067	     * packet is dropped with a probability
1068	     *	p_b = c_3 * avg - c_4
1069	     * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p
1070	     */
1071	    p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4;
1072	} else {
1073	    q->count = -1;
1074	    DPRINTF(("dummynet: - drop"));
1075	    return 1 ;
1076	}
1077    } else if (q->avg > fs->min_th) {
1078	/*
1079	 * we compute p_b using the linear dropping function p_b = c_1 *
1080	 * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 =
1081	 * max_p * min_th / (max_th - min_th)
1082	 */
1083	p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2;
1084    }
1085    if (fs->flags_fs & DN_QSIZE_IS_BYTES)
1086	p_b = (p_b * len) / fs->max_pkt_size;
1087    if (++q->count == 0)
1088	q->random = MY_RANDOM & 0xffff;
1089    else {
1090	/*
1091	 * q->count counts packets arrived since last drop, so a greater
1092	 * value of q->count means a greater packet drop probability.
1093	 */
1094	if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) {
1095	    q->count = 0;
1096	    DPRINTF(("dummynet: - red drop"));
1097	    /* after a drop we calculate a new random value */
1098	    q->random = MY_RANDOM & 0xffff;
1099	    return 1;    /* drop */
1100	}
1101    }
1102    /* end of RED algorithm */
1103    return 0 ; /* accept */
1104}
1105
1106static __inline
1107struct dn_flow_set *
1108locate_flowset(int pipe_nr, struct ip_fw *rule)
1109{
1110    struct dn_flow_set *fs;
1111    ipfw_insn *cmd = rule->cmd + rule->act_ofs;
1112
1113    if (cmd->opcode == O_LOG)
1114	cmd += F_LEN(cmd);
1115
1116    bcopy(& ((ipfw_insn_pipe *)cmd)->pipe_ptr, &fs, sizeof(fs));
1117
1118    if (fs != NULL)
1119	return fs;
1120
1121    if (cmd->opcode == O_QUEUE) {
1122		for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next)
1123	 	   ;
1124	}
1125    else {
1126		struct dn_pipe *p1;
1127		for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next)
1128			;
1129		if (p1 != NULL)
1130			fs = &(p1->fs) ;
1131    }
1132    /* record for the future */
1133    bcopy(&fs, & ((ipfw_insn_pipe *)cmd)->pipe_ptr, sizeof(fs));
1134
1135    return fs ;
1136}
1137
1138/*
1139 * dummynet hook for packets. Below 'pipe' is a pipe or a queue
1140 * depending on whether WF2Q or fixed bw is used.
1141 *
1142 * pipe_nr	pipe or queue the packet is destined for.
1143 * dir		where shall we send the packet after dummynet.
1144 * m		the mbuf with the packet
1145 * ifp		the 'ifp' parameter from the caller.
1146 *		NULL in ip_input, destination interface in ip_output,
1147 *		real_dst in bdg_forward
1148 * ro		route parameter (only used in ip_output, NULL otherwise)
1149 * dst		destination address, only used by ip_output
1150 * rule		matching rule, in case of multiple passes
1151 * flags	flags from the caller, only used in ip_output
1152 *
1153 */
1154static int
1155dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa)
1156{
1157    struct dn_pkt_tag *pkt;
1158    struct m_tag *mtag;
1159    struct dn_flow_set *fs;
1160    struct dn_pipe *pipe ;
1161    u_int64_t len = m->m_pkthdr.len ;
1162    struct dn_flow_queue *q = NULL ;
1163    int is_pipe;
1164    struct timespec ts;
1165    struct timeval	tv;
1166
1167#if IPFW2
1168    ipfw_insn *cmd = fwa->rule->cmd + fwa->rule->act_ofs;
1169
1170    if (cmd->opcode == O_LOG)
1171	cmd += F_LEN(cmd);
1172    is_pipe = (cmd->opcode == O_PIPE);
1173#else
1174    is_pipe = (fwa->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE;
1175#endif
1176
1177    pipe_nr &= 0xffff ;
1178
1179 	lck_mtx_lock(dn_mutex);
1180
1181	/* make all time measurements in milliseconds (ms) -
1182         * here we convert secs and usecs to msecs (just divide the
1183         * usecs and take the closest whole number).
1184	 */
1185        microuptime(&tv);
1186	curr_time = (tv.tv_sec * 1000) + (tv.tv_usec / 1000);
1187
1188   /*
1189     * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
1190     */
1191    fs = locate_flowset(pipe_nr, fwa->rule);
1192    if (fs == NULL)
1193	goto dropit ;	/* this queue/pipe does not exist! */
1194    pipe = fs->pipe ;
1195    if (pipe == NULL) { /* must be a queue, try find a matching pipe */
1196	for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr;
1197		 pipe = pipe->next)
1198	    ;
1199	if (pipe != NULL)
1200	    fs->pipe = pipe ;
1201	else {
1202	    printf("dummynet: no pipe %d for queue %d, drop pkt\n",
1203		fs->parent_nr, fs->fs_nr);
1204	    goto dropit ;
1205	}
1206    }
1207    q = find_queue(fs, &(fwa->f_id));
1208    if ( q == NULL )
1209	goto dropit ;		/* cannot allocate queue		*/
1210    /*
1211     * update statistics, then check reasons to drop pkt
1212     */
1213    q->tot_bytes += len ;
1214    q->tot_pkts++ ;
1215    if ( fs->plr && (MY_RANDOM < fs->plr) )
1216	goto dropit ;		/* random pkt drop			*/
1217    if ( fs->flags_fs & DN_QSIZE_IS_BYTES) {
1218    	if (q->len_bytes > fs->qsize)
1219	    goto dropit ;	/* queue size overflow			*/
1220    } else {
1221	if (q->len >= fs->qsize)
1222	    goto dropit ;	/* queue count overflow			*/
1223    }
1224    if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) )
1225	goto dropit ;
1226
    /* XXX expensive to zero, see if we can remove it */
1228    mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET,
1229    		sizeof(struct dn_pkt_tag), M_NOWAIT);
1230    if ( mtag == NULL )
1231		goto dropit ;		/* cannot allocate packet header	*/
1232    m_tag_prepend(m, mtag);	/* attach to mbuf chain */
1233
1234    pkt = (struct dn_pkt_tag *)(mtag+1);
1235    bzero(pkt, sizeof(struct dn_pkt_tag));
1236    /* ok, i can handle the pkt now... */
1237    /* build and enqueue packet + parameters */
1238    pkt->rule = fwa->rule ;
1239    pkt->dn_dir = dir ;
1240
1241    pkt->ifp = fwa->oif;
1242    if (dir == DN_TO_IP_OUT) {
1243	/*
1244	 * We need to copy *ro because for ICMP pkts (and maybe others)
1245	 * the caller passed a pointer into the stack; dst might also be
1246	 * a pointer into *ro so it needs to be updated.
1247	 */
1248	lck_mtx_lock(rt_mtx);
1249	pkt->ro = *(fwa->ro);
1250	if (fwa->ro->ro_rt)
1251	    rtref(fwa->ro->ro_rt);
1252	if (fwa->dst == (struct sockaddr_in *)&fwa->ro->ro_dst) /* dst points into ro */
1253	    fwa->dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ;
1254	lck_mtx_unlock(rt_mtx);
1255
1256	pkt->dn_dst = fwa->dst;
1257	pkt->flags = fwa->flags;
1258	if (fwa->ipoa != NULL)
1259		pkt->ipoa = *(fwa->ipoa);
1260	}
1261    if (q->head == NULL)
1262	q->head = m;
1263    else
1264	q->tail->m_nextpkt = m;
1265    q->tail = m;
1266    q->len++;
1267    q->len_bytes += len ;
1268
1269    if ( q->head != m )		/* flow was not idle, we are done */
1270	goto done;
1271    /*
1272     * If we reach this point the flow was previously idle, so we need
1273     * to schedule it. This involves different actions for fixed-rate or
1274     * WF2Q queues.
1275     */
1276    if (is_pipe) {
1277	/*
1278	 * Fixed-rate queue: just insert into the ready_heap.
1279	 */
1280	dn_key t = 0 ;
1281	if (pipe->bandwidth)
1282	    t = SET_TICKS(m, q, pipe);
1283	q->sched_time = curr_time ;
1284	if (t == 0)	/* must process it now */
1285	    ready_event( q );
1286	else
1287	    heap_insert(&ready_heap, curr_time + t , q );
1288    } else {
1289	/*
1290	 * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
1291	 * set S to the virtual time V for the controlling pipe, and update
1292	 * the sum of weights for the pipe; otherwise, remove flow from
1293	 * idle_heap and set S to max(F,V).
1294	 * Second, compute finish time F = S + len/weight.
1295	 * Third, if pipe was idle, update V=max(S, V).
1296	 * Fourth, count one more backlogged flow.
1297	 */
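	/*
	 * Illustrative example (values invented for clarity): a previously
	 * idle flow with weight 10 gets S = V; a 1000-byte packet then gives
	 * F = S + (1000 << MY_M) / 10, so doubling the weight halves the
	 * increment of F and the flow is served roughly twice as often by
	 * the WF2Q machinery below.
	 */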
1298	if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
1299	    q->S = pipe->V ;
1300	    pipe->sum += fs->weight ; /* add weight of new queue */
1301	} else {
1302	    heap_extract(&(pipe->idle_heap), q);
1303	    q->S = MAX64(q->F, pipe->V ) ;
1304	}
1305	q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight;
1306
1307	if (pipe->not_eligible_heap.elements == 0 &&
1308		pipe->scheduler_heap.elements == 0)
1309	    pipe->V = MAX64 ( q->S, pipe->V );
1310	fs->backlogged++ ;
1311	/*
	 * Look at eligibility. A flow is not eligible if S>V (when
1313	 * this happens, it means that there is some other flow already
1314	 * scheduled for the same pipe, so the scheduler_heap cannot be
1315	 * empty). If the flow is not eligible we just store it in the
1316	 * not_eligible_heap. Otherwise, we store in the scheduler_heap
1317	 * and possibly invoke ready_event_wfq() right now if there is
1318	 * leftover credit.
1319	 * Note that for all flows in scheduler_heap (SCH), S_i <= V,
1320	 * and for all flows in not_eligible_heap (NEH), S_i > V .
	 * So when we need to compute max( V, min(S_i) ) for all i in SCH+NEH,
1322	 * we only need to look into NEH.
1323	 */
1324	if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */
1325	    if (pipe->scheduler_heap.elements == 0)
1326		printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
1327	    heap_insert(&(pipe->not_eligible_heap), q->S, q);
1328	} else {
1329	    heap_insert(&(pipe->scheduler_heap), q->F, q);
1330	    if (pipe->numbytes >= 0) { /* pipe is idle */
1331		if (pipe->scheduler_heap.elements != 1)
1332		    printf("dummynet: OUCH! pipe should have been idle!\n");
1333		DPRINTF(("dummynet: waking up pipe %d at %d\n",
1334			pipe->pipe_nr, (int)(q->F >> MY_M)));
1335		pipe->sched_time = curr_time ;
1336		ready_event_wfq(pipe);
1337	    }
1338	}
1339    }
1340done:
1341	/* start the timer and set global if not already set */
1342	if (!timer_enabled) {
1343		ts.tv_sec = 0;
1344		ts.tv_nsec = 1 * 1000000;	// 1ms
1345		timer_enabled = 1;
1346		bsd_timeout(dummynet, NULL, &ts);
1347    }
1348
1349	lck_mtx_unlock(dn_mutex);
1350    return 0;
1351
1352dropit:
1353    if (q)
1354	q->drops++ ;
1355	lck_mtx_unlock(dn_mutex);
1356    m_freem(m);
1357    return ( (fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
1358}
1359
1360/*
1361 * Below, the rtfree is only needed when (pkt->dn_dir == DN_TO_IP_OUT)
1362 * Doing this would probably save us the initial bzero of dn_pkt
1363 */
1364#define	DN_FREE_PKT(_m) do {				\
1365	struct m_tag *tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL); \
1366	if (tag) { 					\
1367		struct dn_pkt_tag *n = (struct dn_pkt_tag *)(tag+1);	\
1368		if (n->ro.ro_rt) {				\
1369			rtfree(n->ro.ro_rt);	\
1370			n->ro.ro_rt = NULL;	\
1371		}				\
1372	}									\
1373	m_tag_delete(_m, tag);			\
1374	m_freem(_m);					\
1375} while (0)
1376
1377/*
1378 * Dispose all packets and flow_queues on a flow_set.
1379 * If all=1, also remove red lookup table and other storage,
1380 * including the descriptor itself.
1381 * For the one in dn_pipe MUST also cleanup ready_heap...
1382 */
1383static void
1384purge_flow_set(struct dn_flow_set *fs, int all)
1385{
1386    struct dn_flow_queue *q, *qn ;
1387    int i ;
1388
1389	lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
1390
1391    for (i = 0 ; i <= fs->rq_size ; i++ ) {
1392	for (q = fs->rq[i] ; q ; q = qn ) {
1393	    struct mbuf *m, *mnext;
1394
1395	    mnext = q->head;
1396	    while ((m = mnext) != NULL) {
1397		mnext = m->m_nextpkt;
1398		DN_FREE_PKT(m);
1399	    }
1400	    qn = q->next ;
1401	    FREE(q, M_DUMMYNET);
1402	}
1403	fs->rq[i] = NULL ;
1404    }
1405    fs->rq_elements = 0 ;
1406    if (all) {
1407	/* RED - free lookup table */
1408	if (fs->w_q_lookup)
1409	    FREE(fs->w_q_lookup, M_DUMMYNET);
1410	if (fs->rq)
1411	    FREE(fs->rq, M_DUMMYNET);
1412	/* if this fs is not part of a pipe, free it */
1413	if (fs->pipe && fs != &(fs->pipe->fs) )
1414	    FREE(fs, M_DUMMYNET);
1415    }
1416}
1417
1418/*
1419 * Dispose all packets queued on a pipe (not a flow_set).
1420 * Also free all resources associated to a pipe, which is about
1421 * to be deleted.
1422 */
1423static void
1424purge_pipe(struct dn_pipe *pipe)
1425{
1426    struct mbuf *m, *mnext;
1427
1428    purge_flow_set( &(pipe->fs), 1 );
1429
1430    mnext = pipe->head;
1431    while ((m = mnext) != NULL) {
1432	mnext = m->m_nextpkt;
1433	DN_FREE_PKT(m);
1434    }
1435
1436    heap_free( &(pipe->scheduler_heap) );
1437    heap_free( &(pipe->not_eligible_heap) );
1438    heap_free( &(pipe->idle_heap) );
1439}
1440
1441/*
1442 * Delete all pipes and heaps returning memory. Must also
1443 * remove references from all ipfw rules to all pipes.
1444 */
1445static void
1446dummynet_flush(void)
1447{
1448    struct dn_pipe *curr_p, *p ;
1449    struct dn_flow_set *fs, *curr_fs;
1450
1451	lck_mtx_lock(dn_mutex);
1452
1453    /* remove all references to pipes ...*/
1454    flush_pipe_ptrs(NULL);
1455    /* prevent future matches... */
1456    p = all_pipes ;
1457    all_pipes = NULL ;
1458    fs = all_flow_sets ;
1459    all_flow_sets = NULL ;
1460    /* and free heaps so we don't have unwanted events */
1461    heap_free(&ready_heap);
1462    heap_free(&wfq_ready_heap);
1463    heap_free(&extract_heap);
1464
1465    /*
1466     * Now purge all queued pkts and delete all pipes
1467     */
1468    /* scan and purge all flow_sets. */
1469    for ( ; fs ; ) {
1470	curr_fs = fs ;
1471	fs = fs->next ;
1472	purge_flow_set(curr_fs, 1);
1473    }
1474    for ( ; p ; ) {
1475	purge_pipe(p);
1476	curr_p = p ;
1477	p = p->next ;
1478	FREE(curr_p, M_DUMMYNET);
1479    }
1480	lck_mtx_unlock(dn_mutex);
1481}
1482
1483
1484extern struct ip_fw *ip_fw_default_rule ;
1485static void
1486dn_rule_delete_fs(struct dn_flow_set *fs, void *r)
1487{
1488    int i ;
1489    struct dn_flow_queue *q ;
1490    struct mbuf *m ;
1491
    for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
1493	for (q = fs->rq[i] ; q ; q = q->next )
1494	    for (m = q->head ; m ; m = m->m_nextpkt ) {
1495		struct dn_pkt_tag *pkt = dn_tag_get(m) ;
1496		if (pkt->rule == r)
1497		    pkt->rule = ip_fw_default_rule ;
1498	    }
1499}
1500/*
 * When a firewall rule is deleted, scan all queues and re-point packets
 * that still reference this rule to the default rule.
1503 */
1504void
1505dn_rule_delete(void *r)
1506{
1507    struct dn_pipe *p ;
1508    struct dn_flow_set *fs ;
1509    struct dn_pkt_tag *pkt ;
1510    struct mbuf *m ;
1511
1512	lck_mtx_lock(dn_mutex);
1513
1514    /*
1515     * If the rule references a queue (dn_flow_set), then scan
1516     * the flow set, otherwise scan pipes. Should do either, but doing
1517     * both does not harm.
1518     */
1519    for ( fs = all_flow_sets ; fs ; fs = fs->next )
1520	dn_rule_delete_fs(fs, r);
1521    for ( p = all_pipes ; p ; p = p->next ) {
1522	fs = &(p->fs) ;
1523	dn_rule_delete_fs(fs, r);
1524	for (m = p->head ; m ; m = m->m_nextpkt ) {
1525	    pkt = dn_tag_get(m) ;
1526	    if (pkt->rule == r)
1527		pkt->rule = ip_fw_default_rule ;
1528	}
1529    }
1530    lck_mtx_unlock(dn_mutex);
1531}
1532
1533/*
1534 * setup RED parameters
1535 */
1536static int
1537config_red(struct dn_flow_set *p, struct dn_flow_set * x)
1538{
1539    int i;
1540
1541    x->w_q = p->w_q;
1542    x->min_th = SCALE(p->min_th);
1543    x->max_th = SCALE(p->max_th);
1544    x->max_p = p->max_p;
1545
1546    x->c_1 = p->max_p / (p->max_th - p->min_th);
1547    x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
1548    if (x->flags_fs & DN_IS_GENTLE_RED) {
1549	x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
1550	x->c_4 = (SCALE(1) - 2 * p->max_p);
1551    }
1552
    /* if the lookup table already exists, free and create it again */
1554    if (x->w_q_lookup) {
1555	FREE(x->w_q_lookup, M_DUMMYNET);
1556	x->w_q_lookup = NULL ;
1557    }
1558    if (red_lookup_depth == 0) {
1559	printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth must be > 0\n");
1560	FREE(x, M_DUMMYNET);
1561	return EINVAL;
1562    }
1563    x->lookup_depth = red_lookup_depth;
1564    x->w_q_lookup = (u_int *) _MALLOC(x->lookup_depth * sizeof(int),
1565	    M_DUMMYNET, M_DONTWAIT);
1566    if (x->w_q_lookup == NULL) {
1567	printf("dummynet: sorry, cannot allocate red lookup table\n");
1568	FREE(x, M_DUMMYNET);
1569	return ENOSPC;
1570    }
1571
1572    /* fill the lookup table with (1 - w_q)^x */
1573    x->lookup_step = p->lookup_step ;
1574    x->lookup_weight = p->lookup_weight ;
1575    x->w_q_lookup[0] = SCALE(1) - x->w_q;
1576    for (i = 1; i < x->lookup_depth; i++)
1577	x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
1578    if (red_avg_pkt_size < 1)
1579	red_avg_pkt_size = 512 ;
1580    x->avg_pkt_size = red_avg_pkt_size ;
1581    if (red_max_pkt_size < 1)
1582	red_max_pkt_size = 1500 ;
1583    x->max_pkt_size = red_max_pkt_size ;
1584    return 0 ;
1585}
1586
1587static int
1588alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
1589{
1590    if (x->flags_fs & DN_HAVE_FLOW_MASK) {     /* allocate some slots */
1591	int l = pfs->rq_size;
1592
1593	if (l == 0)
1594	    l = dn_hash_size;
1595	if (l < 4)
1596	    l = 4;
1597	else if (l > DN_MAX_HASH_SIZE)
1598	    l = DN_MAX_HASH_SIZE;
1599	x->rq_size = l;
1600    } else                  /* one is enough for null mask */
1601	x->rq_size = 1;
1602    x->rq = _MALLOC((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
1603	    M_DUMMYNET, M_DONTWAIT | M_ZERO);
1604    if (x->rq == NULL) {
1605	printf("dummynet: sorry, cannot allocate queue\n");
1606	return ENOSPC;
1607    }
1608    x->rq_elements = 0;
1609    return 0 ;
1610}
1611
1612static void
1613set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
1614{
1615    x->flags_fs = src->flags_fs;
1616    x->qsize = src->qsize;
1617    x->plr = src->plr;
1618    x->flow_mask = src->flow_mask;
1619    if (x->flags_fs & DN_QSIZE_IS_BYTES) {
1620	if (x->qsize > 1024*1024)
1621	    x->qsize = 1024*1024 ;
1622    } else {
1623	if (x->qsize == 0)
1624	    x->qsize = 50 ;
1625	if (x->qsize > 100)
1626	    x->qsize = 50 ;
1627    }
1628    /* configuring RED */
1629    if ( x->flags_fs & DN_IS_RED )
1630	config_red(src, x) ;    /* XXX should check errors */
1631}
1632
1633/*
1634 * setup pipe or queue parameters.
1635 */
1636
1637static int
1638config_pipe(struct dn_pipe *p)
1639{
1640    int i, r;
1641    struct dn_flow_set *pfs = &(p->fs);
1642    struct dn_flow_queue *q;
1643
1644    /*
1645     * The config program passes parameters as follows:
1646     * bw = bits/second (0 means no limits),
1647     * delay = ms, must be translated into ticks.
1648     * qsize = slots/bytes
1649     */
1650    p->delay = ( p->delay * (hz*10) ) / 1000 ;
1651    /* We need either a pipe number or a flow_set number */
1652    if (p->pipe_nr == 0 && pfs->fs_nr == 0)
1653	return EINVAL ;
1654    if (p->pipe_nr != 0 && pfs->fs_nr != 0)
1655	return EINVAL ;
1656    if (p->pipe_nr != 0) { /* this is a pipe */
1657	struct dn_pipe *x, *a, *b;
1658
1659	lck_mtx_lock(dn_mutex);
1660/* locate pipe */
1661	for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
1662		 a = b , b = b->next) ;
1663
1664	if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */
1665	    x = _MALLOC(sizeof(struct dn_pipe), M_DUMMYNET, M_DONTWAIT | M_ZERO) ;
1666	    if (x == NULL) {
1667	    lck_mtx_unlock(dn_mutex);
1668		printf("dummynet: no memory for new pipe\n");
1669		return ENOSPC;
1670	    }
1671	    x->pipe_nr = p->pipe_nr;
1672	    x->fs.pipe = x ;
1673	    /* idle_heap is the only one from which we extract from the middle.
1674	     */
1675	    x->idle_heap.size = x->idle_heap.elements = 0 ;
1676	    x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos);
1677	} else {
1678	    x = b;
1679	    /* Flush accumulated credit for all queues */
1680	    for (i = 0; i <= x->fs.rq_size; i++)
1681		for (q = x->fs.rq[i]; q; q = q->next)
1682		    q->numbytes = 0;
1683	}
1684
1685	x->bandwidth = p->bandwidth ;
1686	x->numbytes = 0; /* just in case... */
1687	bcopy(p->if_name, x->if_name, sizeof(p->if_name) );
1688	x->ifp = NULL ; /* reset interface ptr */
1689	x->delay = p->delay ;
1690	set_fs_parms(&(x->fs), pfs);
1691
1692
1693	if ( x->fs.rq == NULL ) { /* a new pipe */
1694	    r = alloc_hash(&(x->fs), pfs) ;
1695	    if (r) {
1696		lck_mtx_unlock(dn_mutex);
1697		FREE(x, M_DUMMYNET);
1698		return r ;
1699	    }
1700	    x->next = b ;
1701	    if (a == NULL)
1702		all_pipes = x ;
1703	    else
1704		a->next = x ;
1705	}
1706	lck_mtx_unlock(dn_mutex);
1707    } else { /* config queue */
1708	struct dn_flow_set *x, *a, *b ;
1709
1710	lck_mtx_lock(dn_mutex);
1711	/* locate flow_set */
1712	for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ;
1713		 a = b , b = b->next) ;
1714
1715	if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new flow_set */
1716	    if (pfs->parent_nr == 0) {	/* need link to a pipe */
1717		lck_mtx_unlock(dn_mutex);
1718		return EINVAL ;
1719	    }
1720	    x = _MALLOC(sizeof(struct dn_flow_set), M_DUMMYNET, M_DONTWAIT | M_ZERO);
1721	    if (x == NULL) {
1722		lck_mtx_unlock(dn_mutex);
1723		printf("dummynet: no memory for new flow_set\n");
1724		return ENOSPC;
1725	    }
1726	    x->fs_nr = pfs->fs_nr;
1727	    x->parent_nr = pfs->parent_nr;
1728	    x->weight = pfs->weight ;
1729	    if (x->weight == 0)
1730		x->weight = 1 ;
1731	    else if (x->weight > 100)
1732		x->weight = 100 ;
1733	} else {
1734	    /* Changing the parent pipe is not allowed; delete and recreate. */
1735	    if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) {
1736		lck_mtx_unlock(dn_mutex);
1737		return EINVAL ;
1738	    }
1739	    x = b;
1740	}
1741	set_fs_parms(x, pfs);
1742
1743	if ( x->rq == NULL ) { /* a new flow_set */
1744	    r = alloc_hash(x, pfs) ;
1745	    if (r) {
1746		lck_mtx_unlock(dn_mutex);
1747		FREE(x, M_DUMMYNET);
1748		return r ;
1749	    }
1750	    x->next = b;
1751	    if (a == NULL)
1752		all_flow_sets = x;
1753	    else
1754		a->next = x;
1755	}
1756	lck_mtx_unlock(dn_mutex);
1757    }
1758    return 0 ;
1759}
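
/*
 * Example (sketch, not part of the kernel build): a userland program
 * configures a pipe by filling in a struct dn_pipe and handing it to
 * setsockopt() on a raw socket with IP_DUMMYNET_CONFIGURE, which is
 * dispatched to config_pipe() above.  Field names follow ip_dummynet.h;
 * error handling is omitted.
 *
 *	#include <strings.h>
 *	#include <sys/socket.h>
 *	#include <netinet/in.h>
 *	#include <netinet/ip_dummynet.h>
 *
 *	struct dn_pipe pipe;
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *
 *	bzero(&pipe, sizeof(pipe));
 *	pipe.pipe_nr = 10;		// pipe number (leave fs.fs_nr == 0)
 *	pipe.bandwidth = 1000000;	// bits/second, 0 means unlimited
 *	pipe.delay = 50;		// milliseconds, converted to ticks above
 *	setsockopt(s, IPPROTO_IP, IP_DUMMYNET_CONFIGURE, &pipe, sizeof(pipe));
 */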
1760
1761/*
1762 * Helper function to remove from a heap all queues that are linked to
1763 * a flow_set which is about to be deleted.
1764 */
1765static void
1766fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
1767{
1768    int i = 0, found = 0 ;
1769    for (; i < h->elements ;)
1770	if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
1771	    h->elements-- ;
1772	    h->p[i] = h->p[h->elements] ;
1773	    found++ ;
1774	} else
1775	    i++ ;
1776    if (found)
1777	heapify(h);
1778}
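
/*
 * Note on the technique above: each matching entry is overwritten with
 * the last element of the heap (shrinking it by one), which may leave
 * the array out of heap order, so a single heapify() at the end restores
 * the invariant instead of fixing the heap after every removal.
 */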
1779
1780/*
1781 * Helper function to remove a pipe from a heap (it can appear there at most once).
1782 */
1783static void
1784pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
1785{
1786    if (h->elements > 0) {
1787	int i = 0 ;
1788	for (i=0; i < h->elements ; i++ ) {
1789	    if (h->p[i].object == p) { /* found it */
1790		h->elements-- ;
1791		h->p[i] = h->p[h->elements] ;
1792		heapify(h);
1793		break ;
1794	    }
1795	}
1796    }
1797}
1798
1799/*
1800 * drain all queues. Called in case of severe mbuf shortage.
1801 */
1802void
1803dummynet_drain(void)
1804{
1805    struct dn_flow_set *fs;
1806    struct dn_pipe *p;
1807    struct mbuf *m, *mnext;
1808
1809    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
1810
1811    heap_free(&ready_heap);
1812    heap_free(&wfq_ready_heap);
1813    heap_free(&extract_heap);
1814    /* purge the queues in all the flow_sets */
1815    for (fs = all_flow_sets; fs; fs= fs->next )
1816	purge_flow_set(fs, 0);
1817
1818    for (p = all_pipes; p; p= p->next ) {
1819	purge_flow_set(&(p->fs), 0);
1820
1821	mnext = p->head;
1822	while ((m = mnext) != NULL) {
1823	    mnext = m->m_nextpkt;
1824	    DN_FREE_PKT(m);
1825	}
1826	p->head = p->tail = NULL ;
1827    }
1828}
1829
1830/*
1831 * Fully delete a pipe or a queue, cleaning up associated info.
1832 */
1833static int
1834delete_pipe(struct dn_pipe *p)
1835{
1836    if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
1837	return EINVAL ;
1838    if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
1839	return EINVAL ;
1840    if (p->pipe_nr != 0) { /* this is an old-style pipe */
1841	struct dn_pipe *a, *b;
1842	struct dn_flow_set *fs;
1843
1844	lck_mtx_lock(dn_mutex);
1845	/* locate pipe */
1846	for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
1847		 a = b , b = b->next) ;
1848	if (b == NULL || (b->pipe_nr != p->pipe_nr) ) {
1849	    lck_mtx_unlock(dn_mutex);
1850	    return EINVAL ; /* not found */
1851	}
1852
1853	/* unlink from list of pipes */
1854	if (a == NULL)
1855	    all_pipes = b->next ;
1856	else
1857	    a->next = b->next ;
1858	/* remove references to this pipe from the ip_fw rules. */
1859	flush_pipe_ptrs(&(b->fs));
1860
1861	/* remove all references to this pipe from flow_sets */
1862	for (fs = all_flow_sets; fs; fs= fs->next )
1863	    if (fs->pipe == b) {
1864		printf("dummynet: ++ ref to pipe %d from fs %d\n",
1865			p->pipe_nr, fs->fs_nr);
1866		fs->pipe = NULL ;
1867		purge_flow_set(fs, 0);
1868	    }
1869	fs_remove_from_heap(&ready_heap, &(b->fs));
1870	purge_pipe(b);	/* remove all data associated to this pipe */
1871	/* remove references to this pipe from extract_heap and wfq_ready_heap */
1872	pipe_remove_from_heap(&extract_heap, b);
1873	pipe_remove_from_heap(&wfq_ready_heap, b);
1874	lck_mtx_unlock(dn_mutex);
1875
1876	FREE(b, M_DUMMYNET);
1877    } else { /* this is a WF2Q queue (dn_flow_set) */
1878	struct dn_flow_set *a, *b;
1879
1880	lck_mtx_lock(dn_mutex);
1881	/* locate set */
1882	for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ;
1883		 a = b , b = b->next) ;
1884	if (b == NULL || (b->fs_nr != p->fs.fs_nr) ) {
1885	    lck_mtx_unlock(dn_mutex);
1886	    return EINVAL ; /* not found */
1887	}
1888
1889	if (a == NULL)
1890	    all_flow_sets = b->next ;
1891	else
1892	    a->next = b->next ;
1893	/* remove references to this flow_set from the ip_fw rules. */
1894	flush_pipe_ptrs(b);
1895
1896	if (b->pipe != NULL) {
1897	    /* Update total weight on parent pipe and cleanup parent heaps */
1898	    b->pipe->sum -= b->weight * b->backlogged ;
1899	    fs_remove_from_heap(&(b->pipe->not_eligible_heap), b);
1900	    fs_remove_from_heap(&(b->pipe->scheduler_heap), b);
1901#if 1	/* XXX should i remove from idle_heap as well ? */
1902	    fs_remove_from_heap(&(b->pipe->idle_heap), b);
1903#endif
1904	}
1905	purge_flow_set(b, 1);
1906	lck_mtx_unlock(dn_mutex);
1907    }
1908    return 0 ;
1909}
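
/*
 * Example (sketch): deletion reuses struct dn_pipe through the
 * IP_DUMMYNET_DEL socket option; only the identifier of the object to
 * remove is filled in.  Setting pipe_nr takes the pipe branch above,
 * setting fs.fs_nr instead takes the flow_set branch (never both).
 *
 *	struct dn_pipe pipe;
 *	bzero(&pipe, sizeof(pipe));
 *	pipe.pipe_nr = 10;	// or pipe.fs.fs_nr = 5; for a queue
 *	setsockopt(s, IPPROTO_IP, IP_DUMMYNET_DEL, &pipe, sizeof(pipe));
 */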
1910
1911/*
1912 * Helper function used to copy flow queue data to userland for IP_DUMMYNET_GET.
1913 */
1914static char *
1915dn_copy_set(struct dn_flow_set *set, char *bp)
1916{
1917    int i, copied = 0 ;
1918    struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;
1919
1920    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
1921
1922    for (i = 0 ; i <= set->rq_size ; i++)
1923	for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
1924	    if (q->hash_slot != i)
1925		printf("dummynet: ++ at %d: wrong slot (have %d, "
1926		    "should be %d)\n", copied, q->hash_slot, i);
1927	    if (q->fs != set)
1928		printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
1929			i, q->fs, set);
1930	    copied++ ;
1931	    bcopy(q, qp, sizeof(*q));
1932	    /* cleanup pointers */
1933	    qp->next = NULL ;
1934	    qp->head = qp->tail = NULL ;
1935	    qp->fs = NULL ;
1936	}
1937    if (copied != set->rq_elements)
1938	printf("dummynet: ++ wrong count, have %d should be %d\n",
1939	    copied, set->rq_elements);
1940    return (char *)qp ;
1941}
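
/*
 * dn_copy_set() thus appends, after the descriptor its caller has already
 * placed in the buffer, a copy of every dn_flow_queue hashed under the
 * set, with the kernel pointers in each copy cleared.  The result is a
 * flat descriptor-plus-queues layout that userland can walk linearly.
 */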
1942
1943static size_t
1944dn_calc_size(void)
1945{
1946    struct dn_flow_set *set ;
1947    struct dn_pipe *p ;
1948    size_t size ;
1949
1950    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
1951
1952    /*
1953     * compute size of data structures: list of pipes and flow_sets.
1954     */
1955    for (p = all_pipes, size = 0 ; p ; p = p->next )
1956	size += sizeof(*p) +
1957	    p->fs.rq_elements * sizeof(struct dn_flow_queue);
1958    for (set = all_flow_sets ; set ; set = set->next )
1959	size += sizeof(*set) +
1960	    set->rq_elements * sizeof(struct dn_flow_queue);
1961    return size ;
1962}
1963
1964static int
1965dummynet_get(struct sockopt *sopt)
1966{
1967    char *buf, *bp ; /* bp is the "copy-pointer" */
1968    size_t size ;
1969    struct dn_flow_set *set ;
1970    struct dn_pipe *p ;
1971    int error=0, i ;
1972
1973    /* XXX lock held too long */
1974    lck_mtx_lock(dn_mutex);
1975    /*
1976     * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we
1977     *      cannot use this flag while holding a mutex.
1978     */
1979    for (i = 0; i < 10; i++) {
1980	size = dn_calc_size();
1981	lck_mtx_unlock(dn_mutex);
1982	buf = _MALLOC(size, M_TEMP, M_WAITOK);
1983	lck_mtx_lock(dn_mutex);
1984	if (size == dn_calc_size())
1985	    break;
1986	FREE(buf, M_TEMP);
1987	buf = NULL;
1988    }
1989    if (buf == NULL) {
1990	lck_mtx_unlock(dn_mutex);
1991	return ENOBUFS ;
1992    }
1993    for (p = all_pipes, bp = buf ; p ; p = p->next ) {
1994	struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ;
1995
1996	/*
1997	 * copy pipe descriptor into *bp, convert delay back to ms,
1998	 * then copy the flow_set descriptor(s) one at a time.
1999	 * After each flow_set, copy the queue descriptors it owns.
2000	 */
2001	bcopy(p, bp, sizeof(*p));
2002	pipe_bp->delay = (pipe_bp->delay * 1000) / (hz*10) ;
2003	/*
2004	 * XXX the following is a hack based on ->next being the
2005	 * first field in dn_pipe and dn_flow_set. The correct
2006	 * solution would be to move the dn_flow_set to the beginning
2007	 * of struct dn_pipe.
2008	 */
2009	pipe_bp->next = (struct dn_pipe *)DN_IS_PIPE ;
2010	/* clean pointers */
2011	pipe_bp->head = pipe_bp->tail = NULL ;
2012	pipe_bp->fs.next = NULL ;
2013	pipe_bp->fs.pipe = NULL ;
2014	pipe_bp->fs.rq = NULL ;
2015
2016	bp += sizeof(*p);
2017	bp = dn_copy_set( &(p->fs), bp );
2018    }
2019    for (set = all_flow_sets ; set ; set = set->next ) {
2020	struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ;
2021	bcopy(set, bp, sizeof(*set));
2022	/* XXX same hack as above */
2023	fs_bp->next = (struct dn_flow_set *)DN_IS_QUEUE ;
2024	fs_bp->pipe = NULL ;
2025	fs_bp->rq = NULL ;
2026	bp += sizeof(*set);
2027	bp = dn_copy_set( set, bp );
2028    }
2029    lck_mtx_unlock(dn_mutex);
2030
2031    error = sooptcopyout(sopt, buf, size);
2032    FREE(buf, M_TEMP);
2033    return error ;
2034}
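
/*
 * Example (sketch): userland retrieves the whole configuration with the
 * IP_DUMMYNET_GET socket option (on the raw socket from the earlier
 * sketch) and walks the flat buffer, using the DN_IS_PIPE / DN_IS_QUEUE
 * markers stored in the ->next field (see the XXX note above) to tell
 * pipes from flow_sets.  Buffer sizing is simplified here; a real reader
 * would retry with a larger buffer on truncation.
 *
 *	char buf[65536];
 *	socklen_t len = sizeof(buf);
 *
 *	if (getsockopt(s, IPPROTO_IP, IP_DUMMYNET_GET, buf, &len) == 0) {
 *		char *bp = buf;
 *		while (bp < buf + len) {
 *			struct dn_pipe *p = (struct dn_pipe *)bp;
 *			if (p->next == (struct dn_pipe *)DN_IS_PIPE) {
 *				// pipe descriptor followed by its queues
 *				bp += sizeof(*p) +
 *				    p->fs.rq_elements * sizeof(struct dn_flow_queue);
 *			} else {
 *				// flow_set descriptor followed by its queues
 *				struct dn_flow_set *fs = (struct dn_flow_set *)bp;
 *				bp += sizeof(*fs) +
 *				    fs->rq_elements * sizeof(struct dn_flow_queue);
 *			}
 *		}
 *	}
 */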
2035
2036/*
2037 * Handler for the various dummynet socket options (get, flush, config, del)
2038 */
2039static int
2040ip_dn_ctl(struct sockopt *sopt)
2041{
2042    int error = 0 ;
2043    struct dn_pipe *p, tmp_pipe;
2044
2045    /* Disallow sets in really-really secure mode. */
2046    if (sopt->sopt_dir == SOPT_SET && securelevel >= 3)
2047	return (EPERM);
2048
2049    switch (sopt->sopt_name) {
2050    default :
2051	printf("dummynet: -- unknown option %d\n", sopt->sopt_name);
2052	return EINVAL ;
2053
2054    case IP_DUMMYNET_GET :
2055	error = dummynet_get(sopt);
2056	break ;
2057
2058    case IP_DUMMYNET_FLUSH :
2059	dummynet_flush() ;
2060	break ;
2061
2062    case IP_DUMMYNET_CONFIGURE :
2063	p = &tmp_pipe ;
2064	error = sooptcopyin(sopt, p, sizeof(*p), sizeof(*p));
2065	if (error)
2066	    break ;
2067	error = config_pipe(p);
2068	break ;
2069
2070    case IP_DUMMYNET_DEL :	/* remove a pipe or queue */
2071	p = &tmp_pipe ;
2072	error = sooptcopyin(sopt, p, sizeof(*p), sizeof(*p));
2073	if (error)
2074	    break ;
2075
2076	error = delete_pipe(p);
2077	break ;
2078    }
2079    return error ;
2080}
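
/*
 * Example (sketch): the remaining option, IP_DUMMYNET_FLUSH, takes no
 * argument and simply discards every configured pipe and flow_set:
 *
 *	setsockopt(s, IPPROTO_IP, IP_DUMMYNET_FLUSH, NULL, 0);
 */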
2081
2082void
2083ip_dn_init(void)
2084{
2085	/* setup locks */
2086	dn_mutex_grp_attr = lck_grp_attr_alloc_init();
2087	dn_mutex_grp = lck_grp_alloc_init("dn", dn_mutex_grp_attr);
2088	dn_mutex_attr = lck_attr_alloc_init();
2089
2090	if ((dn_mutex = lck_mtx_alloc_init(dn_mutex_grp, dn_mutex_attr)) == NULL) {
2091		printf("ip_dn_init: can't alloc dn_mutex\n");
2092		return;
2093	}
2094
2095    all_pipes = NULL ;
2096    all_flow_sets = NULL ;
2097    ready_heap.size = ready_heap.elements = 0 ;
2098    ready_heap.offset = 0 ;
2099
2100    wfq_ready_heap.size = wfq_ready_heap.elements = 0 ;
2101    wfq_ready_heap.offset = 0 ;
2102
2103    extract_heap.size = extract_heap.elements = 0 ;
2104    extract_heap.offset = 0 ;
2105    ip_dn_ctl_ptr = ip_dn_ctl;
2106    ip_dn_io_ptr = dummynet_io;
2107    ip_dn_ruledel_ptr = dn_rule_delete;
2108}
2109