1/* 2 * Copyright (c) 2000-2008 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28/* 29 * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa 30 * Portions Copyright (c) 2000 Akamba Corp. 31 * All rights reserved 32 * 33 * Redistribution and use in source and binary forms, with or without 34 * modification, are permitted provided that the following conditions 35 * are met: 36 * 1. Redistributions of source code must retain the above copyright 37 * notice, this list of conditions and the following disclaimer. 38 * 2. 
Redistributions in binary form must reproduce the above copyright 39 * notice, this list of conditions and the following disclaimer in the 40 * documentation and/or other materials provided with the distribution. 41 * 42 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 43 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 44 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 45 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 46 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 47 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 48 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 49 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 50 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 51 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 52 * SUCH DAMAGE. 53 * 54 * $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.84 2004/08/25 09:31:30 pjd Exp $ 55 */ 56 57#define DUMMYNET_DEBUG 58 59/* 60 * This module implements IP dummynet, a bandwidth limiter/delay emulator 61 * used in conjunction with the ipfw package. 62 * Description of the data structures used is in ip_dummynet.h 63 * Here you mainly find the following blocks of code: 64 * + variable declarations; 65 * + heap management functions; 66 * + scheduler and dummynet functions; 67 * + configuration and initialization. 68 * 69 * NOTA BENE: critical sections are protected by the "dummynet lock". 70 * 71 * Most important Changes: 72 * 73 * 010124: Fixed WF2Q behaviour 74 * 010122: Fixed spl protection. 75 * 000601: WF2Q support 76 * 000106: large rewrite, use heaps to handle very many pipes. 
 * 980513: initial release
 *
 * include files marked with XXX are probably not needed
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/queue.h>			/* XXX */
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/time.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/route.h>
#include <net/kpi_protocol.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>
#include <netinet/ip_var.h>

#if BRIDGE
#include <netinet/if_ether.h> /* for struct arpcom */
#include <net/bridge.h>
#endif

/*
 * We keep a private variable for the simulation time, but we could
 * probably use an existing one ("softticks" in sys/kern/kern_timer.c)
 */
static dn_key curr_time = 0 ; /* current simulation time (ms, see dummynet()) */

/* this is for the timer that fires to call dummynet() - we only enable the timer when
   there are packets to process, otherwise it's disabled */
static int timer_enabled = 0;

static int dn_hash_size = 64 ;	/* default hash size */

/* statistics on number of queue searches and search steps */
static int searches, search_steps ;
static int pipe_expire = 1 ;   /* expire queue if empty */
static int dn_max_ratio = 16 ; /* max queues/buckets ratio */

static int red_lookup_depth = 256;	/* RED - default lookup table depth */
static int red_avg_pkt_size = 512;      /* RED - default medium packet size */
static int red_max_pkt_size = 1500;     /* RED - default max packet size */

/*
 * Three heaps contain queues and pipes that the scheduler handles:
 *
 * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
 *
 * wfq_ready_heap contains the pipes associated with WF2Q flows
 *
 * extract_heap contains pipes associated with delay lines.
 *
 */
static struct dn_heap ready_heap, extract_heap, wfq_ready_heap ;

static int heap_init(struct dn_heap *h, int size) ;
static int heap_insert (struct dn_heap *h, dn_key key1, void *p);
static void heap_extract(struct dn_heap *h, void *obj);

static void transmit_event(struct dn_pipe *pipe);
static void ready_event(struct dn_flow_queue *q);

static struct dn_pipe *all_pipes = NULL ;	/* list of all pipes */
static struct dn_flow_set *all_flow_sets = NULL ;/* list of all flow_sets */

#ifdef SYSCTL_NODE
SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
		CTLFLAG_RW, 0, "Dummynet");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
	    CTLFLAG_RW, &dn_hash_size, 0, "Default hash table size");
/* NOTE(review): curr_time is a dn_key (64-bit) but exported via SYSCTL_INT —
 * only the low word is visible; confirm whether this is intentional. */
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, curr_time,
	    CTLFLAG_RD, &curr_time, 0, "Current tick");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
	    CTLFLAG_RD, &ready_heap.size, 0, "Size of ready heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
	    CTLFLAG_RD, &extract_heap.size, 0, "Size of extract heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
	    CTLFLAG_RD, &searches, 0, "Number of queue searches");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
	    CTLFLAG_RD, &search_steps, 0, "Number of queue search steps");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
	    CTLFLAG_RW, &pipe_expire, 0, "Expire queue if empty");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
	    CTLFLAG_RW, &dn_max_ratio, 0,
	"Max ratio between dynamic queues and buckets");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
	CTLFLAG_RD, &red_lookup_depth, 0, "Depth of RED lookup table");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
	CTLFLAG_RD, &red_avg_pkt_size, 0, "RED Medium packet size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
	CTLFLAG_RD, &red_max_pkt_size, 0, "RED Max packet size");
#endif

#ifdef DUMMYNET_DEBUG
int	dummynet_debug = 0;
#ifdef SYSCTL_NODE
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW, &dummynet_debug,
	    0, "control debugging printfs");
#endif
#define	DPRINTF(X)	if (dummynet_debug) printf X
#else
#define	DPRINTF(X)
#endif

/* contrary to the comment above random(), it does not actually
 * return a value [0, 2^31 - 1], which breaks plr amongst other
 * things. Masking it should work even if the behavior of
 * the function is fixed.
 */
#define MY_RANDOM (random() & 0x7FFFFFFF)

/* dummynet lock — serializes all access to the pipe/flow-set lists and heaps */
lck_grp_t         *dn_mutex_grp;
lck_grp_attr_t    *dn_mutex_grp_attr;
lck_attr_t        *dn_mutex_attr;
lck_mtx_t         *dn_mutex;

static int config_pipe(struct dn_pipe *p);
static int ip_dn_ctl(struct sockopt *sopt);

static void dummynet(void *);
static void dummynet_flush(void);
void dummynet_drain(void);
static ip_dn_io_t dummynet_io;
static void dn_rule_delete(void *);

int if_tx_rdy(struct ifnet *ifp);

/*
 * Heap management functions.
 *
 * In the heap, first node is element 0. Children of i are 2i+1 and 2i+2.
 * Some macros help finding parent/children so we can optimize them.
 *
 * heap_init() is called to expand the heap when needed.
 * Increment size in blocks of 16 entries.
 * XXX failure to allocate a new element is a pretty bad failure
 * as we basically stall a whole queue forever!!
 * Returns 1 on error, 0 on success
 */
#define HEAP_FATHER(x) ( ( (x) - 1 ) / 2 )
#define HEAP_LEFT(x) ( 2*(x) + 1 )
#define HEAP_IS_LEFT(x) ( (x) & 1 )
#define HEAP_RIGHT(x) ( 2*(x) + 2 )
#define	HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
#define HEAP_INCREMENT	15

/*
 * Grow the heap's backing array to hold at least new_size entries,
 * rounded up to a multiple of (HEAP_INCREMENT+1) = 16.
 * Existing entries are copied over and the old array is freed.
 * Returns 1 on allocation failure, 0 on success (including the
 * "already big enough" bogus-call case).
 */
static int
heap_init(struct dn_heap *h, int new_size)
{
    struct dn_heap_entry *p;

    if (h->size >= new_size ) {
	printf("dummynet: heap_init, Bogus call, have %d want %d\n",
		h->size, new_size);
	return 0 ;
    }
    /* round up to the next multiple of 16 entries */
    new_size = (new_size + HEAP_INCREMENT ) & ~HEAP_INCREMENT ;
    p = _MALLOC(new_size * sizeof(*p), M_DUMMYNET, M_DONTWAIT );
    if (p == NULL) {
	printf("dummynet: heap_init, resize %d failed\n", new_size );
	return 1 ; /* error */
    }
    if (h->size > 0) {
	bcopy(h->p, p, h->size * sizeof(*p) );
	FREE(h->p, M_DUMMYNET);
    }
    h->p = p ;
    h->size = new_size ;
    return 0 ;
}

/*
 * Insert element in heap. Normally, p != NULL, we insert p in
 * a new position and bubble up. If p == NULL, then the element is
 * already in place, and key is the position where to start the
 * bubble-up.
 * Returns 1 on failure (cannot allocate new heap entry)
 *
 * If offset > 0 the position (index, int) of the element in the heap is
 * also stored in the element itself at the given offset in bytes.
 */
#define SET_OFFSET(heap, node) \
    if (heap->offset > 0) \
	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
/*
 * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
 */
#define RESET_OFFSET(heap, node) \
    if (heap->offset > 0) \
	    *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
static int
heap_insert(struct dn_heap *h, dn_key key1, void *p)
{
    int son = h->elements ;

    if (p == NULL)	/* data already there, set starting point */
	son = key1 ;	/* NOTE: key1 doubles as a heap index in this mode */
    else {		/* insert new element at the end, possibly resize */
	son = h->elements ;
	if (son == h->size) /* need resize... */
	    if (heap_init(h, h->elements+1) )
		return 1 ; /* failure... */
	h->p[son].object = p ;
	h->p[son].key = key1 ;
	h->elements++ ;
    }
    while (son > 0) {				/* bubble up */
	int father = HEAP_FATHER(son) ;
	struct dn_heap_entry tmp ;

	if (DN_KEY_LT( h->p[father].key, h->p[son].key ) )
	    break ; /* found right position */
	/* son smaller than father, swap and repeat */
	HEAP_SWAP(h->p[son], h->p[father], tmp) ;
	SET_OFFSET(h, son);
	son = father ;
    }
    SET_OFFSET(h, son);
    return 0 ;
}

/*
 * remove top element from heap, or obj if obj != NULL
 */
static void
heap_extract(struct dn_heap *h, void *obj)
{
    int child, father, maxelt = h->elements - 1 ;

    if (maxelt < 0) {
	printf("dummynet: warning, extract from empty heap 0x%p\n", h);
	return ;
    }
    father = 0 ; /* default: move up smallest child */
    if (obj != NULL) { /* extract specific element, index is at offset */
	if (h->offset <= 0)
	    panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
	father = *((int *)((char *)obj + h->offset)) ;
	if (father < 0 || father >= h->elements) {
	    printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
		father, h->elements);
	    panic("dummynet: heap_extract");
	}
    }
    RESET_OFFSET(h, father);
    child = HEAP_LEFT(father) ;		/* left child */
    /* sift the hole down to the bottom, always promoting the smaller child */
    while (child <= maxelt) {		/* valid entry */
	if (child != maxelt && DN_KEY_LT(h->p[child+1].key, h->p[child].key) )
	    child = child+1 ;		/* take right child, otherwise left */
	h->p[father] = h->p[child] ;
	SET_OFFSET(h, father);
	father = child ;
	child = HEAP_LEFT(child) ;	/* left child for next loop */
    }
    h->elements-- ;
    if (father != maxelt) {
	/*
	 * Fill hole with last entry and bubble up, reusing the insert code
	 */
	h->p[father] = h->p[maxelt] ;
	heap_insert(h, father, NULL); /* this one cannot fail */
    }
}

#if 0
/*
 * change object position and update references
 * XXX this one is never used!
 */
static void
heap_move(struct dn_heap *h, dn_key new_key, void *object)
{
    int temp;
    int i ;
    int maxelt = h->elements-1 ;
    struct dn_heap_entry buf ;

    if (h->offset <= 0)
	panic("cannot move items on this heap");

    i = *((int *)((char *)object + h->offset));
    if (DN_KEY_LT(new_key, h->p[i].key) ) { /* must move up */
	h->p[i].key = new_key ;
	for (; i>0 && DN_KEY_LT(new_key, h->p[(temp = HEAP_FATHER(i))].key) ;
		 i = temp ) { /* bubble up */
	    HEAP_SWAP(h->p[i], h->p[temp], buf) ;
	    SET_OFFSET(h, i);
	}
    } else {		/* must move down */
	h->p[i].key = new_key ;
	while ( (temp = HEAP_LEFT(i)) <= maxelt ) { /* found left child */
	    if ((temp != maxelt) && DN_KEY_GT(h->p[temp].key, h->p[temp+1].key))
		temp++ ; /* select child with min key */
	    if (DN_KEY_GT(new_key, h->p[temp].key)) { /* go down */
		HEAP_SWAP(h->p[i], h->p[temp], buf) ;
		SET_OFFSET(h, i);
	    } else
		break ;
	    i = temp ;
	}
    }
    SET_OFFSET(h, i);
}
#endif /* heap_move, unused */

/*
 * heapify() will reorganize data inside an array to maintain the
 * heap property. It is needed when we delete a bunch of entries.
 */
static void
heapify(struct dn_heap *h)
{
    int i ;

    /* bubble up each element in turn; heap_insert with p==NULL uses i
     * as the starting index rather than inserting a new entry */
    for (i = 0 ; i < h->elements ; i++ )
	heap_insert(h, i , NULL) ;
}

/*
 * cleanup the heap and free data structure
 */
static void
heap_free(struct dn_heap *h)
{
    if (h->size >0 )
	FREE(h->p, M_DUMMYNET);
    bzero(h, sizeof(*h));
}

/*
 * --- end of heap management functions ---
 */

/*
 * Return the mbuf tag holding the dummynet state. As an optimization
 * this is assumed to be the first tag on the list. If this turns out
 * wrong we'll need to search the list.
 */
static struct dn_pkt_tag *
dn_tag_get(struct mbuf *m)
{
    struct m_tag *mtag = m_tag_first(m);
/*    KASSERT(mtag != NULL &&
	    mtag->m_tag_id == KERNEL_MODULE_TAG_ID &&
	    mtag->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET,
	    ("packet on dummynet queue w/o dummynet tag!"));
*/
    /* the tag payload lives immediately after the m_tag header */
    return (struct dn_pkt_tag *)(mtag+1);
}

/*
 * Scheduler functions:
 *
 * transmit_event() is called when the delay-line needs to enter
 * the scheduler, either because of existing pkts getting ready,
 * or new packets entering the queue. The event handled is the delivery
 * time of the packet.
 *
 * ready_event() does something similar with fixed-rate queues, and the
 * event handled is the finish time of the head pkt.
 *
 * wfq_ready_event() does something similar with WF2Q queues, and the
 * event handled is the start time of the head pkt.
 *
 * In all cases, we make sure that the data structures are consistent
 * before passing pkts out, because this might trigger recursive
 * invocations of the procedures.
 */
static void
transmit_event(struct dn_pipe *pipe)
{
    struct mbuf *m ;
    struct dn_pkt_tag *pkt ;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    /* deliver every packet whose output_time has been reached */
    while ( (m = pipe->head) ) {
	pkt = dn_tag_get(m);
	if ( !DN_KEY_LEQ(pkt->output_time, curr_time) )
	    break;
	/*
	 * first unlink, then call procedures, since ip_input() can invoke
	 * ip_output() and viceversa, thus causing nested calls
	 */
	pipe->head = m->m_nextpkt ;
	m->m_nextpkt = NULL;

	/* XXX: drop the lock for now to avoid LOR's */
	lck_mtx_unlock(dn_mutex);
	switch (pkt->dn_dir) {
	case DN_TO_IP_OUT: {
	    /* copy the cached route so we can release its reference after
	     * ip_output() returns */
	    struct route tmp_rt = pkt->ro;
	    (void)ip_output(m, NULL, NULL, pkt->flags, NULL, NULL);
	    if (tmp_rt.ro_rt) {
		rtfree(tmp_rt.ro_rt);
		tmp_rt.ro_rt = NULL;
	    }
	    break ;
	}
	case DN_TO_IP_IN :
	    proto_inject(PF_INET, m);
	    break ;

#if BRIDGE
	case DN_TO_BDG_FWD :
	    /*
	     * The bridge requires/assumes the Ethernet header is
	     * contiguous in the first mbuf header. Insure this is true.
	     */
	    if (BDG_LOADED) {
		if (m->m_len < ETHER_HDR_LEN &&
		    (m = m_pullup(m, ETHER_HDR_LEN)) == NULL) {
		    printf("dummynet/bridge: pullup fail, dropping pkt\n");
		    break;
		}
		m = bdg_forward_ptr(m, pkt->ifp);
	    } else {
		/* somebody unloaded the bridge module. Drop pkt */
		/* XXX rate limit */
		printf("dummynet: dropping bridged packet trapped in pipe\n");
	    }
	    if (m)
		m_freem(m);
	    break;
#endif
	default:
	    printf("dummynet: bad switch %d!\n", pkt->dn_dir);
	    m_freem(m);
	    break ;
	}
	lck_mtx_lock(dn_mutex);
    }
    /* if there are leftover packets, put into the heap for next event */
    if ( (m = pipe->head) ) {
	pkt = dn_tag_get(m);
	/* XXX should check errors on heap_insert, by draining the
	 * whole pipe p and hoping in the future we are more successful
	 */
	heap_insert(&extract_heap, pkt->output_time, pipe);
    }
}

/*
 * the following macro computes how many ticks we have to wait
 * before being able to transmit a packet. The credit is taken from
 * either a pipe (WF2Q) or a flow_queue (per-flow queueing)
 */

/* hz is 100, which gives a granularity of 10ms in the old timer.
 * The timer has been changed to fire every 1ms, so the use of
 * hz has been modified here. All instances of hz have been left
 * in place but adjusted by a factor of 10 so that hz is functionally
 * equal to 1000.
 */
#define SET_TICKS(_m, q, p) \
    ((_m)->m_pkthdr.len*8*(hz*10) - (q)->numbytes + p->bandwidth - 1 ) / \
	    p->bandwidth ;

/*
 * extract pkt from queue, compute output time (could be now)
 * and put into delay line (p_queue)
 */
static void
move_pkt(struct mbuf *pkt, struct dn_flow_queue *q,
	struct dn_pipe *p, int len)
{
    struct dn_pkt_tag *dt = dn_tag_get(pkt);

    /* unlink from the flow queue and update its accounting */
    q->head = pkt->m_nextpkt ;
    q->len-- ;
    q->len_bytes -= len ;

    dt->output_time = curr_time + p->delay ;

    /* append to the pipe's delay line */
    if (p->head == NULL)
	p->head = pkt;
    else
	p->tail->m_nextpkt = pkt;
    p->tail = pkt;
    p->tail->m_nextpkt = NULL;
}

/*
 * ready_event() is invoked every time the queue must enter the
 * scheduler, either because the first packet arrives, or because
 * a previously scheduled event fired.
 * On invokation, drain as many pkts as possible (could be 0) and then
 * if there are leftover packets reinsert the pkt in the scheduler.
 */
static void
ready_event(struct dn_flow_queue *q)
{
    struct mbuf *pkt;
    struct dn_pipe *p = q->fs->pipe ;
    int p_was_empty ;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    if (p == NULL) {
	printf("dummynet: ready_event- pipe is gone\n");
	return ;
    }
    p_was_empty = (p->head == NULL) ;

    /*
     * schedule fixed-rate queues linked to this pipe:
     * Account for the bw accumulated since last scheduling, then
     * drain as many pkts as allowed by q->numbytes and move to
     * the delay line (in p) computing output time.
     * bandwidth==0 (no limit) means we can drain the whole queue,
     * setting len_scaled = 0 does the job.
     */
    q->numbytes += ( curr_time - q->sched_time ) * p->bandwidth;
    while ( (pkt = q->head) != NULL ) {
	int len = pkt->m_pkthdr.len;
	int len_scaled = p->bandwidth ? len*8*(hz*10) : 0 ;
	if (len_scaled > q->numbytes )
	    break ;
	q->numbytes -= len_scaled ;
	move_pkt(pkt, q, p, len);
    }
    /*
     * If we have more packets queued, schedule next ready event
     * (can only occur when bandwidth != 0, otherwise we would have
     * flushed the whole queue in the previous loop).
     * To this purpose we record the current time and compute how many
     * ticks to go for the finish time of the packet.
     */
    if ( (pkt = q->head) != NULL ) { /* this implies bandwidth != 0 */
	dn_key t = SET_TICKS(pkt, q, p); /* ticks i have to wait */
	q->sched_time = curr_time ;
	heap_insert(&ready_heap, curr_time + t, (void *)q );
	/* XXX should check errors on heap_insert, and drain the whole
	 * queue on error hoping next time we are luckier.
	 */
    } else {	/* RED needs to know when the queue becomes empty */
	q->q_time = curr_time;
	q->numbytes = 0;
    }
    /*
     * If the delay line was empty call transmit_event(p) now.
     * Otherwise, the scheduler will take care of it.
     */
    if (p_was_empty)
	transmit_event(p);
}

/*
 * Called when we can transmit packets on WF2Q queues. Take pkts out of
 * the queues at their start time, and enqueue into the delay line.
 * Packets are drained until p->numbytes < 0. As long as
 * len_scaled >= p->numbytes, the packet goes into the delay line
 * with a deadline p->delay. For the last packet, if p->numbytes<0,
 * there is an additional delay.
641 */ 642static void 643ready_event_wfq(struct dn_pipe *p) 644{ 645 int p_was_empty = (p->head == NULL) ; 646 struct dn_heap *sch = &(p->scheduler_heap); 647 struct dn_heap *neh = &(p->not_eligible_heap) ; 648 649 lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED); 650 651 if (p->if_name[0] == 0) /* tx clock is simulated */ 652 p->numbytes += ( curr_time - p->sched_time ) * p->bandwidth; 653 else { /* tx clock is for real, the ifq must be empty or this is a NOP */ 654 if (p->ifp && p->ifp->if_snd.ifq_head != NULL) 655 return ; 656 else { 657 DPRINTF(("dummynet: pipe %d ready from %s --\n", 658 p->pipe_nr, p->if_name)); 659 } 660 } 661 662 /* 663 * While we have backlogged traffic AND credit, we need to do 664 * something on the queue. 665 */ 666 while ( p->numbytes >=0 && (sch->elements>0 || neh->elements >0) ) { 667 if (sch->elements > 0) { /* have some eligible pkts to send out */ 668 struct dn_flow_queue *q = sch->p[0].object ; 669 struct mbuf *pkt = q->head; 670 struct dn_flow_set *fs = q->fs; 671 u_int64_t len = pkt->m_pkthdr.len; 672 int len_scaled = p->bandwidth ? len*8*(hz*10) : 0 ; 673 674 heap_extract(sch, NULL); /* remove queue from heap */ 675 p->numbytes -= len_scaled ; 676 move_pkt(pkt, q, p, len); 677 678 p->V += (len<<MY_M) / p->sum ; /* update V */ 679 q->S = q->F ; /* update start time */ 680 if (q->len == 0) { /* Flow not backlogged any more */ 681 fs->backlogged-- ; 682 heap_insert(&(p->idle_heap), q->F, q); 683 } else { /* still backlogged */ 684 /* 685 * update F and position in backlogged queue, then 686 * put flow in not_eligible_heap (we will fix this later). 687 */ 688 len = (q->head)->m_pkthdr.len; 689 q->F += (len<<MY_M)/(u_int64_t) fs->weight ; 690 if (DN_KEY_LEQ(q->S, p->V)) 691 heap_insert(neh, q->S, q); 692 else 693 heap_insert(sch, q->F, q); 694 } 695 } 696 /* 697 * now compute V = max(V, min(S_i)). 
Remember that all elements in sch 698 * have by definition S_i <= V so if sch is not empty, V is surely 699 * the max and we must not update it. Conversely, if sch is empty 700 * we only need to look at neh. 701 */ 702 if (sch->elements == 0 && neh->elements > 0) 703 p->V = MAX64 ( p->V, neh->p[0].key ); 704 /* move from neh to sch any packets that have become eligible */ 705 while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V) ) { 706 struct dn_flow_queue *q = neh->p[0].object ; 707 heap_extract(neh, NULL); 708 heap_insert(sch, q->F, q); 709 } 710 711 if (p->if_name[0] != '\0') {/* tx clock is from a real thing */ 712 p->numbytes = -1 ; /* mark not ready for I/O */ 713 break ; 714 } 715 } 716 if (sch->elements == 0 && neh->elements == 0 && p->numbytes >= 0 717 && p->idle_heap.elements > 0) { 718 /* 719 * no traffic and no events scheduled. We can get rid of idle-heap. 720 */ 721 int i ; 722 723 for (i = 0 ; i < p->idle_heap.elements ; i++) { 724 struct dn_flow_queue *q = p->idle_heap.p[i].object ; 725 726 q->F = 0 ; 727 q->S = q->F + 1 ; 728 } 729 p->sum = 0 ; 730 p->V = 0 ; 731 p->idle_heap.elements = 0 ; 732 } 733 /* 734 * If we are getting clocks from dummynet (not a real interface) and 735 * If we are under credit, schedule the next ready event. 736 * Also fix the delivery time of the last packet. 737 */ 738 if (p->if_name[0]==0 && p->numbytes < 0) { /* this implies bandwidth >0 */ 739 dn_key t=0 ; /* number of ticks i have to wait */ 740 741 if (p->bandwidth > 0) 742 t = ( p->bandwidth -1 - p->numbytes) / p->bandwidth ; 743 dn_tag_get(p->tail)->output_time += t ; 744 p->sched_time = curr_time ; 745 heap_insert(&wfq_ready_heap, curr_time + t, (void *)p); 746 /* XXX should check errors on heap_insert, and drain the whole 747 * queue on error hoping next time we are luckier. 748 */ 749 } 750 /* 751 * If the delay line was empty call transmit_event(p) now. 752 * Otherwise, the scheduler will take care of it. 
753 */ 754 if (p_was_empty) 755 transmit_event(p); 756} 757 758/* 759 * This is called every 1ms. It is used to 760 * increment the current tick counter and schedule expired events. 761 */ 762static void 763dummynet(__unused void * unused) 764{ 765 void *p ; /* generic parameter to handler */ 766 struct dn_heap *h ; 767 struct dn_heap *heaps[3]; 768 int i; 769 struct dn_pipe *pe ; 770 struct timespec ts; 771 struct timeval tv; 772 773 heaps[0] = &ready_heap ; /* fixed-rate queues */ 774 heaps[1] = &wfq_ready_heap ; /* wfq queues */ 775 heaps[2] = &extract_heap ; /* delay line */ 776 777 lck_mtx_lock(dn_mutex); 778 779 /* make all time measurements in milliseconds (ms) - 780 * here we convert secs and usecs to msecs (just divide the 781 * usecs and take the closest whole number). 782 */ 783 microuptime(&tv); 784 curr_time = (tv.tv_sec * 1000) + (tv.tv_usec / 1000); 785 786 for (i=0; i < 3 ; i++) { 787 h = heaps[i]; 788 while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time) ) { 789 if (h->p[0].key > curr_time) 790 printf("dummynet: warning, heap %d is %d ticks late\n", 791 i, (int)(curr_time - h->p[0].key)); 792 p = h->p[0].object ; /* store a copy before heap_extract */ 793 heap_extract(h, NULL); /* need to extract before processing */ 794 if (i == 0) 795 ready_event(p) ; 796 else if (i == 1) { 797 struct dn_pipe *pipe = p; 798 if (pipe->if_name[0] != '\0') 799 printf("dummynet: bad ready_event_wfq for pipe %s\n", 800 pipe->if_name); 801 else 802 ready_event_wfq(p) ; 803 } else 804 transmit_event(p); 805 } 806 } 807 /* sweep pipes trying to expire idle flow_queues */ 808 for (pe = all_pipes; pe ; pe = pe->next ) 809 if (pe->idle_heap.elements > 0 && 810 DN_KEY_LT(pe->idle_heap.p[0].key, pe->V) ) { 811 struct dn_flow_queue *q = pe->idle_heap.p[0].object ; 812 813 heap_extract(&(pe->idle_heap), NULL); 814 q->S = q->F + 1 ; /* mark timestamp as invalid */ 815 pe->sum -= q->fs->weight ; 816 } 817 818 /* check the heaps to see if there's still stuff in there, and 
819 * only set the timer if there are packets to process 820 */ 821 timer_enabled = 0; 822 for (i=0; i < 3 ; i++) { 823 h = heaps[i]; 824 if (h->elements > 0) { // set the timer 825 ts.tv_sec = 0; 826 ts.tv_nsec = 1 * 1000000; // 1ms 827 timer_enabled = 1; 828 bsd_timeout(dummynet, NULL, &ts); 829 break; 830 } 831 } 832 833 lck_mtx_unlock(dn_mutex); 834} 835 836/* 837 * called by an interface when tx_rdy occurs. 838 */ 839int 840if_tx_rdy(struct ifnet *ifp) 841{ 842 struct dn_pipe *p; 843 844 lck_mtx_lock(dn_mutex); 845 for (p = all_pipes; p ; p = p->next ) 846 if (p->ifp == ifp) 847 break ; 848 if (p == NULL) { 849 char buf[32]; 850 snprintf(buf, sizeof(buf), "%s%d",ifp->if_name, ifp->if_unit); 851 for (p = all_pipes; p ; p = p->next ) 852 if (!strcmp(p->if_name, buf) ) { 853 p->ifp = ifp ; 854 DPRINTF(("dummynet: ++ tx rdy from %s (now found)\n", buf)); 855 break ; 856 } 857 } 858 if (p != NULL) { 859 DPRINTF(("dummynet: ++ tx rdy from %s%d - qlen %d\n", ifp->if_name, 860 ifp->if_unit, ifp->if_snd.ifq_len)); 861 p->numbytes = 0 ; /* mark ready for I/O */ 862 ready_event_wfq(p); 863 } 864 lck_mtx_lock(dn_mutex); 865 866 return 0; 867} 868 869/* 870 * Unconditionally expire empty queues in case of shortage. 871 * Returns the number of queues freed. 
 */
static int
expire_queues(struct dn_flow_set *fs)
{
    struct dn_flow_queue *q, *prev ;
    int i, initial_elements = fs->rq_elements ;
    struct timeval timenow;

    getmicrotime(&timenow);

    /* rate-limit the sweep to at most once per second */
    if (fs->last_expired == timenow.tv_sec)
	return 0 ;
    fs->last_expired = timenow.tv_sec ;
    for (i = 0 ; i <= fs->rq_size ; i++) /* last one is overflow */
	for (prev=NULL, q = fs->rq[i] ; q != NULL ; )
	    /* q->S == q->F+1 marks an invalid (idle) timestamp, so a queue
	     * with no packets and that marker is safe to reclaim */
	    if (q->head != NULL || q->S != q->F+1) {
		prev = q ;
		q = q->next ;
	    } else { /* entry is idle, expire it */
		struct dn_flow_queue *old_q = q ;

		if (prev != NULL)
		    prev->next = q = q->next ;
		else
		    fs->rq[i] = q = q->next ;
		fs->rq_elements-- ;
		FREE(old_q, M_DUMMYNET);
	    }
    return initial_elements - fs->rq_elements ;
}

/*
 * If room, create a new queue and put at head of slot i;
 * otherwise, create or use the default queue.
 */
static struct dn_flow_queue *
create_queue(struct dn_flow_set *fs, int i)
{
    struct dn_flow_queue *q ;

    if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
	    expire_queues(fs) == 0) {
	/*
	 * No way to get room, use or create overflow queue.
	 */
	i = fs->rq_size ;
	if ( fs->rq[i] != NULL )
	    return fs->rq[i] ;
    }
    q = _MALLOC(sizeof(*q), M_DUMMYNET, M_DONTWAIT | M_ZERO);
    if (q == NULL) {
	printf("dummynet: sorry, cannot allocate queue for new flow\n");
	return NULL ;
    }
    q->fs = fs ;
    q->hash_slot = i ;
    q->next = fs->rq[i] ;
    q->S = q->F + 1;   /* hack - mark timestamp as invalid */
    fs->rq[i] = q ;
    fs->rq_elements++ ;
    return q ;
}

/*
 * Given a flow_set and a pkt in last_pkt, find a matching queue
 * after appropriate masking. The queue is moved to front
 * so that further searches take less time.
 */
static struct dn_flow_queue *
find_queue(struct dn_flow_set *fs, struct ipfw_flow_id *id)
{
    int i = 0 ; /* we need i and q for new allocations */
    struct dn_flow_queue *q, *prev;

    if ( !(fs->flags_fs & DN_HAVE_FLOW_MASK) )
	q = fs->rq[0] ; /* no mask: everything shares the single queue */
    else {
	/* first, do the masking (NOTE: the id is modified in place) */
	id->dst_ip &= fs->flow_mask.dst_ip ;
	id->src_ip &= fs->flow_mask.src_ip ;
	id->dst_port &= fs->flow_mask.dst_port ;
	id->src_port &= fs->flow_mask.src_port ;
	id->proto &= fs->flow_mask.proto ;
	id->flags = 0 ; /* we don't care about this one */
	/* then, hash function */
	i = ( (id->dst_ip) & 0xffff ) ^
	    ( (id->dst_ip >> 15) & 0xffff ) ^
	    ( (id->src_ip << 1) & 0xffff ) ^
	    ( (id->src_ip >> 16 ) & 0xffff ) ^
	    (id->dst_port << 1) ^ (id->src_port) ^
	    (id->proto );
	i = i % fs->rq_size ;
	/* finally, scan the current list for a match */
	searches++ ;
	for (prev=NULL, q = fs->rq[i] ; q ; ) {
	    search_steps++;
	    if (id->dst_ip == q->id.dst_ip &&
		    id->src_ip == q->id.src_ip &&
		    id->dst_port == q->id.dst_port &&
		    id->src_port == q->id.src_port &&
		    id->proto == q->id.proto &&
		    id->flags == q->id.flags)
		break ; /* found */
	    else if (pipe_expire && q->head == NULL && q->S == q->F+1 ) {
		/* entry is idle and not in any heap, expire it */
		struct dn_flow_queue *old_q = q ;

		if (prev != NULL)
		    prev->next = q = q->next ;
		else
		    fs->rq[i] = q = q->next ;
		fs->rq_elements-- ;
		FREE(old_q, M_DUMMYNET);
		continue ;
	    }
	    prev = q ;
	    q = q->next ;
	}
	if (q && prev != NULL) { /* found and not in front */
	    /* move-to-front so further searches take less time */
	    prev->next = q->next ;
	    q->next = fs->rq[i] ;
	    fs->rq[i] = q ;
	}
    }
    if (q == NULL) { /* no match, need to allocate a new entry */
	q = create_queue(fs, i);
	if (q != NULL)
	    q->id = *id ;
    }
    return q ;
}

static int
red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
{
    /*
* RED algorithm 1009 * 1010 * RED calculates the average queue size (avg) using a low-pass filter 1011 * with an exponential weighted (w_q) moving average: 1012 * avg <- (1-w_q) * avg + w_q * q_size 1013 * where q_size is the queue length (measured in bytes or * packets). 1014 * 1015 * If q_size == 0, we compute the idle time for the link, and set 1016 * avg = (1 - w_q)^(idle/s) 1017 * where s is the time needed for transmitting a medium-sized packet. 1018 * 1019 * Now, if avg < min_th the packet is enqueued. 1020 * If avg > max_th the packet is dropped. Otherwise, the packet is 1021 * dropped with probability P function of avg. 1022 * 1023 */ 1024 1025 int64_t p_b = 0; 1026 /* queue in bytes or packets ? */ 1027 u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len; 1028 1029 DPRINTF(("\ndummynet: %d q: %2u ", (int) curr_time, q_size)); 1030 1031 /* average queue size estimation */ 1032 if (q_size != 0) { 1033 /* 1034 * queue is not empty, avg <- avg + (q_size - avg) * w_q 1035 */ 1036 int diff = SCALE(q_size) - q->avg; 1037 int64_t v = SCALE_MUL((int64_t) diff, (int64_t) fs->w_q); 1038 1039 q->avg += (int) v; 1040 } else { 1041 /* 1042 * queue is empty, find for how long the queue has been 1043 * empty and use a lookup table for computing 1044 * (1 - * w_q)^(idle_time/s) where s is the time to send a 1045 * (small) packet. 1046 * XXX check wraps... 1047 */ 1048 if (q->avg) { 1049 u_int t = (curr_time - q->q_time) / fs->lookup_step; 1050 1051 q->avg = (t < fs->lookup_depth) ? 1052 SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0; 1053 } 1054 } 1055 DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg))); 1056 1057 /* should i drop ? 
*/ 1058 1059 if (q->avg < fs->min_th) { 1060 q->count = -1; 1061 return 0; /* accept packet ; */ 1062 } 1063 if (q->avg >= fs->max_th) { /* average queue >= max threshold */ 1064 if (fs->flags_fs & DN_IS_GENTLE_RED) { 1065 /* 1066 * According to Gentle-RED, if avg is greater than max_th the 1067 * packet is dropped with a probability 1068 * p_b = c_3 * avg - c_4 1069 * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p 1070 */ 1071 p_b = SCALE_MUL((int64_t) fs->c_3, (int64_t) q->avg) - fs->c_4; 1072 } else { 1073 q->count = -1; 1074 DPRINTF(("dummynet: - drop")); 1075 return 1 ; 1076 } 1077 } else if (q->avg > fs->min_th) { 1078 /* 1079 * we compute p_b using the linear dropping function p_b = c_1 * 1080 * avg - c_2, where c_1 = max_p / (max_th - min_th), and c_2 = 1081 * max_p * min_th / (max_th - min_th) 1082 */ 1083 p_b = SCALE_MUL((int64_t) fs->c_1, (int64_t) q->avg) - fs->c_2; 1084 } 1085 if (fs->flags_fs & DN_QSIZE_IS_BYTES) 1086 p_b = (p_b * len) / fs->max_pkt_size; 1087 if (++q->count == 0) 1088 q->random = MY_RANDOM & 0xffff; 1089 else { 1090 /* 1091 * q->count counts packets arrived since last drop, so a greater 1092 * value of q->count means a greater packet drop probability. 
1093 */ 1094 if (SCALE_MUL(p_b, SCALE((int64_t) q->count)) > q->random) { 1095 q->count = 0; 1096 DPRINTF(("dummynet: - red drop")); 1097 /* after a drop we calculate a new random value */ 1098 q->random = MY_RANDOM & 0xffff; 1099 return 1; /* drop */ 1100 } 1101 } 1102 /* end of RED algorithm */ 1103 return 0 ; /* accept */ 1104} 1105 1106static __inline 1107struct dn_flow_set * 1108locate_flowset(int pipe_nr, struct ip_fw *rule) 1109{ 1110 struct dn_flow_set *fs; 1111 ipfw_insn *cmd = rule->cmd + rule->act_ofs; 1112 1113 if (cmd->opcode == O_LOG) 1114 cmd += F_LEN(cmd); 1115 1116 bcopy(& ((ipfw_insn_pipe *)cmd)->pipe_ptr, &fs, sizeof(fs)); 1117 1118 if (fs != NULL) 1119 return fs; 1120 1121 if (cmd->opcode == O_QUEUE) { 1122 for (fs=all_flow_sets; fs && fs->fs_nr != pipe_nr; fs=fs->next) 1123 ; 1124 } 1125 else { 1126 struct dn_pipe *p1; 1127 for (p1 = all_pipes; p1 && p1->pipe_nr != pipe_nr; p1 = p1->next) 1128 ; 1129 if (p1 != NULL) 1130 fs = &(p1->fs) ; 1131 } 1132 /* record for the future */ 1133 bcopy(&fs, & ((ipfw_insn_pipe *)cmd)->pipe_ptr, sizeof(fs)); 1134 1135 return fs ; 1136} 1137 1138/* 1139 * dummynet hook for packets. Below 'pipe' is a pipe or a queue 1140 * depending on whether WF2Q or fixed bw is used. 1141 * 1142 * pipe_nr pipe or queue the packet is destined for. 1143 * dir where shall we send the packet after dummynet. 1144 * m the mbuf with the packet 1145 * ifp the 'ifp' parameter from the caller. 
1146 * NULL in ip_input, destination interface in ip_output, 1147 * real_dst in bdg_forward 1148 * ro route parameter (only used in ip_output, NULL otherwise) 1149 * dst destination address, only used by ip_output 1150 * rule matching rule, in case of multiple passes 1151 * flags flags from the caller, only used in ip_output 1152 * 1153 */ 1154static int 1155dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa) 1156{ 1157 struct dn_pkt_tag *pkt; 1158 struct m_tag *mtag; 1159 struct dn_flow_set *fs; 1160 struct dn_pipe *pipe ; 1161 u_int64_t len = m->m_pkthdr.len ; 1162 struct dn_flow_queue *q = NULL ; 1163 int is_pipe; 1164 struct timespec ts; 1165 struct timeval tv; 1166 1167#if IPFW2 1168 ipfw_insn *cmd = fwa->rule->cmd + fwa->rule->act_ofs; 1169 1170 if (cmd->opcode == O_LOG) 1171 cmd += F_LEN(cmd); 1172 is_pipe = (cmd->opcode == O_PIPE); 1173#else 1174 is_pipe = (fwa->rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE; 1175#endif 1176 1177 pipe_nr &= 0xffff ; 1178 1179 lck_mtx_lock(dn_mutex); 1180 1181 /* make all time measurements in milliseconds (ms) - 1182 * here we convert secs and usecs to msecs (just divide the 1183 * usecs and take the closest whole number). 1184 */ 1185 microuptime(&tv); 1186 curr_time = (tv.tv_sec * 1000) + (tv.tv_usec / 1000); 1187 1188 /* 1189 * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule. 1190 */ 1191 fs = locate_flowset(pipe_nr, fwa->rule); 1192 if (fs == NULL) 1193 goto dropit ; /* this queue/pipe does not exist! 
*/ 1194 pipe = fs->pipe ; 1195 if (pipe == NULL) { /* must be a queue, try find a matching pipe */ 1196 for (pipe = all_pipes; pipe && pipe->pipe_nr != fs->parent_nr; 1197 pipe = pipe->next) 1198 ; 1199 if (pipe != NULL) 1200 fs->pipe = pipe ; 1201 else { 1202 printf("dummynet: no pipe %d for queue %d, drop pkt\n", 1203 fs->parent_nr, fs->fs_nr); 1204 goto dropit ; 1205 } 1206 } 1207 q = find_queue(fs, &(fwa->f_id)); 1208 if ( q == NULL ) 1209 goto dropit ; /* cannot allocate queue */ 1210 /* 1211 * update statistics, then check reasons to drop pkt 1212 */ 1213 q->tot_bytes += len ; 1214 q->tot_pkts++ ; 1215 if ( fs->plr && (MY_RANDOM < fs->plr) ) 1216 goto dropit ; /* random pkt drop */ 1217 if ( fs->flags_fs & DN_QSIZE_IS_BYTES) { 1218 if (q->len_bytes > fs->qsize) 1219 goto dropit ; /* queue size overflow */ 1220 } else { 1221 if (q->len >= fs->qsize) 1222 goto dropit ; /* queue count overflow */ 1223 } 1224 if ( fs->flags_fs & DN_IS_RED && red_drops(fs, q, len) ) 1225 goto dropit ; 1226 1227 /* XXX expensive to zero, see if we can remove it*/ 1228 mtag = m_tag_alloc(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, 1229 sizeof(struct dn_pkt_tag), M_NOWAIT); 1230 if ( mtag == NULL ) 1231 goto dropit ; /* cannot allocate packet header */ 1232 m_tag_prepend(m, mtag); /* attach to mbuf chain */ 1233 1234 pkt = (struct dn_pkt_tag *)(mtag+1); 1235 bzero(pkt, sizeof(struct dn_pkt_tag)); 1236 /* ok, i can handle the pkt now... */ 1237 /* build and enqueue packet + parameters */ 1238 pkt->rule = fwa->rule ; 1239 pkt->dn_dir = dir ; 1240 1241 pkt->ifp = fwa->oif; 1242 if (dir == DN_TO_IP_OUT) { 1243 /* 1244 * We need to copy *ro because for ICMP pkts (and maybe others) 1245 * the caller passed a pointer into the stack; dst might also be 1246 * a pointer into *ro so it needs to be updated. 
1247 */ 1248 lck_mtx_lock(rt_mtx); 1249 pkt->ro = *(fwa->ro); 1250 if (fwa->ro->ro_rt) 1251 rtref(fwa->ro->ro_rt); 1252 if (fwa->dst == (struct sockaddr_in *)&fwa->ro->ro_dst) /* dst points into ro */ 1253 fwa->dst = (struct sockaddr_in *)&(pkt->ro.ro_dst) ; 1254 lck_mtx_unlock(rt_mtx); 1255 1256 pkt->dn_dst = fwa->dst; 1257 pkt->flags = fwa->flags; 1258 if (fwa->ipoa != NULL) 1259 pkt->ipoa = *(fwa->ipoa); 1260 } 1261 if (q->head == NULL) 1262 q->head = m; 1263 else 1264 q->tail->m_nextpkt = m; 1265 q->tail = m; 1266 q->len++; 1267 q->len_bytes += len ; 1268 1269 if ( q->head != m ) /* flow was not idle, we are done */ 1270 goto done; 1271 /* 1272 * If we reach this point the flow was previously idle, so we need 1273 * to schedule it. This involves different actions for fixed-rate or 1274 * WF2Q queues. 1275 */ 1276 if (is_pipe) { 1277 /* 1278 * Fixed-rate queue: just insert into the ready_heap. 1279 */ 1280 dn_key t = 0 ; 1281 if (pipe->bandwidth) 1282 t = SET_TICKS(m, q, pipe); 1283 q->sched_time = curr_time ; 1284 if (t == 0) /* must process it now */ 1285 ready_event( q ); 1286 else 1287 heap_insert(&ready_heap, curr_time + t , q ); 1288 } else { 1289 /* 1290 * WF2Q. First, compute start time S: if the flow was idle (S=F+1) 1291 * set S to the virtual time V for the controlling pipe, and update 1292 * the sum of weights for the pipe; otherwise, remove flow from 1293 * idle_heap and set S to max(F,V). 1294 * Second, compute finish time F = S + len/weight. 1295 * Third, if pipe was idle, update V=max(S, V). 1296 * Fourth, count one more backlogged flow. 
1297 */ 1298 if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */ 1299 q->S = pipe->V ; 1300 pipe->sum += fs->weight ; /* add weight of new queue */ 1301 } else { 1302 heap_extract(&(pipe->idle_heap), q); 1303 q->S = MAX64(q->F, pipe->V ) ; 1304 } 1305 q->F = q->S + ( len<<MY_M )/(u_int64_t) fs->weight; 1306 1307 if (pipe->not_eligible_heap.elements == 0 && 1308 pipe->scheduler_heap.elements == 0) 1309 pipe->V = MAX64 ( q->S, pipe->V ); 1310 fs->backlogged++ ; 1311 /* 1312 * Look at eligibility. A flow is not eligibile if S>V (when 1313 * this happens, it means that there is some other flow already 1314 * scheduled for the same pipe, so the scheduler_heap cannot be 1315 * empty). If the flow is not eligible we just store it in the 1316 * not_eligible_heap. Otherwise, we store in the scheduler_heap 1317 * and possibly invoke ready_event_wfq() right now if there is 1318 * leftover credit. 1319 * Note that for all flows in scheduler_heap (SCH), S_i <= V, 1320 * and for all flows in not_eligible_heap (NEH), S_i > V . 1321 * So when we need to compute max( V, min(S_i) ) forall i in SCH+NEH, 1322 * we only need to look into NEH. 1323 */ 1324 if (DN_KEY_GT(q->S, pipe->V) ) { /* not eligible */ 1325 if (pipe->scheduler_heap.elements == 0) 1326 printf("dummynet: ++ ouch! not eligible but empty scheduler!\n"); 1327 heap_insert(&(pipe->not_eligible_heap), q->S, q); 1328 } else { 1329 heap_insert(&(pipe->scheduler_heap), q->F, q); 1330 if (pipe->numbytes >= 0) { /* pipe is idle */ 1331 if (pipe->scheduler_heap.elements != 1) 1332 printf("dummynet: OUCH! 
pipe should have been idle!\n"); 1333 DPRINTF(("dummynet: waking up pipe %d at %d\n", 1334 pipe->pipe_nr, (int)(q->F >> MY_M))); 1335 pipe->sched_time = curr_time ; 1336 ready_event_wfq(pipe); 1337 } 1338 } 1339 } 1340done: 1341 /* start the timer and set global if not already set */ 1342 if (!timer_enabled) { 1343 ts.tv_sec = 0; 1344 ts.tv_nsec = 1 * 1000000; // 1ms 1345 timer_enabled = 1; 1346 bsd_timeout(dummynet, NULL, &ts); 1347 } 1348 1349 lck_mtx_unlock(dn_mutex); 1350 return 0; 1351 1352dropit: 1353 if (q) 1354 q->drops++ ; 1355 lck_mtx_unlock(dn_mutex); 1356 m_freem(m); 1357 return ( (fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS); 1358} 1359 1360/* 1361 * Below, the rtfree is only needed when (pkt->dn_dir == DN_TO_IP_OUT) 1362 * Doing this would probably save us the initial bzero of dn_pkt 1363 */ 1364#define DN_FREE_PKT(_m) do { \ 1365 struct m_tag *tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET, NULL); \ 1366 if (tag) { \ 1367 struct dn_pkt_tag *n = (struct dn_pkt_tag *)(tag+1); \ 1368 if (n->ro.ro_rt) { \ 1369 rtfree(n->ro.ro_rt); \ 1370 n->ro.ro_rt = NULL; \ 1371 } \ 1372 } \ 1373 m_tag_delete(_m, tag); \ 1374 m_freem(_m); \ 1375} while (0) 1376 1377/* 1378 * Dispose all packets and flow_queues on a flow_set. 1379 * If all=1, also remove red lookup table and other storage, 1380 * including the descriptor itself. 1381 * For the one in dn_pipe MUST also cleanup ready_heap... 
 */
static void
purge_flow_set(struct dn_flow_set *fs, int all)
{
    struct dn_flow_queue *q, *qn ;
    int i ;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    /* rq has rq_size+1 buckets; the last one is the overflow slot */
    for (i = 0 ; i <= fs->rq_size ; i++ ) {
	for (q = fs->rq[i] ; q ; q = qn ) {
	    struct mbuf *m, *mnext;

	    mnext = q->head;
	    while ((m = mnext) != NULL) {
		mnext = m->m_nextpkt;
		DN_FREE_PKT(m);
	    }
	    qn = q->next ;
	    FREE(q, M_DUMMYNET);
	}
	fs->rq[i] = NULL ;
    }
    fs->rq_elements = 0 ;
    if (all) {
	/* RED - free lookup table */
	if (fs->w_q_lookup)
	    FREE(fs->w_q_lookup, M_DUMMYNET);
	if (fs->rq)
	    FREE(fs->rq, M_DUMMYNET);
	/* if this fs is not part of a pipe, free it */
	if (fs->pipe && fs != &(fs->pipe->fs) )
	    FREE(fs, M_DUMMYNET);
    }
}

/*
 * Dispose all packets queued on a pipe (not a flow_set).
 * Also free all resources associated to a pipe, which is about
 * to be deleted.
 */
static void
purge_pipe(struct dn_pipe *pipe)
{
    struct mbuf *m, *mnext;

    purge_flow_set( &(pipe->fs), 1 );

    /* drain the pipe's own transmit list */
    mnext = pipe->head;
    while ((m = mnext) != NULL) {
	mnext = m->m_nextpkt;
	DN_FREE_PKT(m);
    }

    heap_free( &(pipe->scheduler_heap) );
    heap_free( &(pipe->not_eligible_heap) );
    heap_free( &(pipe->idle_heap) );
}

/*
 * Delete all pipes and heaps returning memory. Must also
 * remove references from all ipfw rules to all pipes.
 */
static void
dummynet_flush(void)
{
    struct dn_pipe *curr_p, *p ;
    struct dn_flow_set *fs, *curr_fs;

    lck_mtx_lock(dn_mutex);

    /* remove all references to pipes ...*/
    flush_pipe_ptrs(NULL);
    /* prevent future matches: detach the global lists first */
    p = all_pipes ;
    all_pipes = NULL ;
    fs = all_flow_sets ;
    all_flow_sets = NULL ;
    /* and free heaps so we don't have unwanted events */
    heap_free(&ready_heap);
    heap_free(&wfq_ready_heap);
    heap_free(&extract_heap);

    /*
     * Now purge all queued pkts and delete all pipes
     */
    /* scan and purge all flow_sets. */
    for ( ; fs ; ) {
	curr_fs = fs ;
	fs = fs->next ;
	purge_flow_set(curr_fs, 1);
    }
    for ( ; p ; ) {
	purge_pipe(p);
	curr_p = p ;
	p = p->next ;
	FREE(curr_p, M_DUMMYNET);
    }
    lck_mtx_unlock(dn_mutex);
}


extern struct ip_fw *ip_fw_default_rule ;
/*
 * Redirect any queued packet whose matching rule is 'r' to the default
 * rule, so the rule can be freed while its packets are still in flight.
 */
static void
dn_rule_delete_fs(struct dn_flow_set *fs, void *r)
{
    int i ;
    struct dn_flow_queue *q ;
    struct mbuf *m ;

    for (i = 0 ; i <= fs->rq_size ; i++) /* last one is ovflow */
	for (q = fs->rq[i] ; q ; q = q->next )
	    for (m = q->head ; m ; m = m->m_nextpkt ) {
		struct dn_pkt_tag *pkt = dn_tag_get(m) ;
		if (pkt->rule == r)
		    pkt->rule = ip_fw_default_rule ;
	    }
}
/*
 * when a firewall rule is deleted, scan all queues and remove the flow-id
 * from packets matching this rule.
 */
void
dn_rule_delete(void *r)
{
    struct dn_pipe *p ;
    struct dn_flow_set *fs ;
    struct dn_pkt_tag *pkt ;
    struct mbuf *m ;

    lck_mtx_lock(dn_mutex);

    /*
     * If the rule references a queue (dn_flow_set), then scan
     * the flow set, otherwise scan pipes. Should do either, but doing
     * both does not harm.
     */
    for ( fs = all_flow_sets ; fs ; fs = fs->next )
	dn_rule_delete_fs(fs, r);
    for ( p = all_pipes ; p ; p = p->next ) {
	fs = &(p->fs) ;
	dn_rule_delete_fs(fs, r);
	/* also fix packets already on the pipe's transmit list */
	for (m = p->head ; m ; m = m->m_nextpkt ) {
	    pkt = dn_tag_get(m) ;
	    if (pkt->rule == r)
		pkt->rule = ip_fw_default_rule ;
	}
    }
    lck_mtx_unlock(dn_mutex);
}

/*
 * setup RED parameters: copy user-supplied values from 'p' into 'x',
 * precompute the fixed-point drop-probability coefficients and build
 * the (1-w_q)^i idle-decay lookup table.  Returns 0 or an errno.
 *
 * NOTE(review): on failure this frees 'x' itself, but the caller chain
 * (set_fs_parms <- config_pipe) ignores the return value and keeps
 * using 'x' — looks like a potential use-after-free on the error path;
 * verify against callers before relying on these error returns.
 */
static int
config_red(struct dn_flow_set *p, struct dn_flow_set * x)
{
    int i;

    x->w_q = p->w_q;
    x->min_th = SCALE(p->min_th);
    x->max_th = SCALE(p->max_th);
    x->max_p = p->max_p;

    x->c_1 = p->max_p / (p->max_th - p->min_th);
    x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
    if (x->flags_fs & DN_IS_GENTLE_RED) {
	x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
	x->c_4 = (SCALE(1) - 2 * p->max_p);
    }

    /* if the lookup table already exist, free and create it again */
    if (x->w_q_lookup) {
	FREE(x->w_q_lookup, M_DUMMYNET);
	x->w_q_lookup = NULL ;
    }
    if (red_lookup_depth == 0) {
	printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth must be > 0\n");
	FREE(x, M_DUMMYNET);
	return EINVAL;
    }
    x->lookup_depth = red_lookup_depth;
    x->w_q_lookup = (u_int *) _MALLOC(x->lookup_depth * sizeof(int),
	    M_DUMMYNET, M_DONTWAIT);
    if (x->w_q_lookup == NULL) {
	printf("dummynet: sorry, cannot allocate red lookup table\n");
	FREE(x, M_DUMMYNET);
	return ENOSPC;
    }

    /* fill the lookup table with (1 - w_q)^x */
    x->lookup_step = p->lookup_step ;
    x->lookup_weight = p->lookup_weight ;
    x->w_q_lookup[0] = SCALE(1) - x->w_q;
    for (i = 1; i < x->lookup_depth; i++)
	x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
    if (red_avg_pkt_size < 1)
	red_avg_pkt_size = 512 ;
    x->avg_pkt_size = red_avg_pkt_size ;
    if (red_max_pkt_size < 1)
	red_max_pkt_size = 1500 ;
    x->max_pkt_size = red_max_pkt_size ;
    return 0 ;
}

/*
 * Allocate the hash bucket array x->rq (rq_size buckets plus one
 * overflow slot).  Bucket count comes from pfs->rq_size, clamped to
 * [4, DN_MAX_HASH_SIZE], or 1 when there is no flow mask.
 */
static int
alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
{
    if (x->flags_fs & DN_HAVE_FLOW_MASK) {	/* allocate some slots */
	int l = pfs->rq_size;

	if (l == 0)
	    l = dn_hash_size;
	if (l < 4)
	    l = 4;
	else if (l > DN_MAX_HASH_SIZE)
	    l = DN_MAX_HASH_SIZE;
	x->rq_size = l;
    } else			/* one is enough for null mask */
	x->rq_size = 1;
    x->rq = _MALLOC((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
	    M_DUMMYNET, M_DONTWAIT | M_ZERO);
    if (x->rq == NULL) {
	printf("dummynet: sorry, cannot allocate queue\n");
	return ENOSPC;
    }
    x->rq_elements = 0;
    return 0 ;
}

/*
 * Copy user-configurable flow-set parameters from 'src' into 'x',
 * clamping qsize (1MB in byte mode, 50 slots in packet mode) and
 * configuring RED when requested.
 */
static void
set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
{
    x->flags_fs = src->flags_fs;
    x->qsize = src->qsize;
    x->plr = src->plr;
    x->flow_mask = src->flow_mask;
    if (x->flags_fs & DN_QSIZE_IS_BYTES) {
	if (x->qsize > 1024*1024)
	    x->qsize = 1024*1024 ;
    } else {
	if (x->qsize == 0)
	    x->qsize = 50 ;
	if (x->qsize > 100)
	    x->qsize = 50 ;
    }
    /* configuring RED */
    if ( x->flags_fs & DN_IS_RED )
	config_red(src, x) ;	/* XXX should check errors */
}

/*
 * setup pipe or queue parameters.
 */

static int
config_pipe(struct dn_pipe *p)
{
    int i, r;
    struct dn_flow_set *pfs = &(p->fs);
    struct dn_flow_queue *q;

    /*
     * The config program passes parameters as follows:
     * bw = bits/second (0 means no limits),
     * delay = ms, must be translated into ticks.
     * qsize = slots/bytes
     */
    p->delay = ( p->delay * (hz*10) ) / 1000 ;
    /* We need either a pipe number or a flow_set number */
    if (p->pipe_nr == 0 && pfs->fs_nr == 0)
	return EINVAL ;
    if (p->pipe_nr != 0 && pfs->fs_nr != 0)
	return EINVAL ;
    if (p->pipe_nr != 0) { /* this is a pipe */
	struct dn_pipe *x, *a, *b;

	lck_mtx_lock(dn_mutex);
	/* locate pipe: list is kept sorted by pipe_nr, 'a' trails 'b' */
	for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
		 a = b , b = b->next) ;

	if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */
	    x = _MALLOC(sizeof(struct dn_pipe), M_DUMMYNET, M_DONTWAIT | M_ZERO) ;
	    if (x == NULL) {
		lck_mtx_unlock(dn_mutex);
		printf("dummynet: no memory for new pipe\n");
		return ENOSPC;
	    }
	    x->pipe_nr = p->pipe_nr;
	    x->fs.pipe = x ;
	    /* idle_heap is the only one from which we extract from the middle.
	     */
	    x->idle_heap.size = x->idle_heap.elements = 0 ;
	    x->idle_heap.offset=OFFSET_OF(struct dn_flow_queue, heap_pos);
	} else {
	    x = b;
	    /* Flush accumulated credit for all queues */
	    for (i = 0; i <= x->fs.rq_size; i++)
		for (q = x->fs.rq[i]; q; q = q->next)
		    q->numbytes = 0;
	}

	x->bandwidth = p->bandwidth ;
	x->numbytes = 0; /* just in case... */
	bcopy(p->if_name, x->if_name, sizeof(p->if_name) );
	x->ifp = NULL ; /* reset interface ptr */
	x->delay = p->delay ;
	set_fs_parms(&(x->fs), pfs);


	if ( x->fs.rq == NULL ) { /* a new pipe */
	    r = alloc_hash(&(x->fs), pfs) ;
	    if (r) {
		lck_mtx_unlock(dn_mutex);
		FREE(x, M_DUMMYNET);
		return r ;
	    }
	    /* insert in sorted position between 'a' and 'b' */
	    x->next = b ;
	    if (a == NULL)
		all_pipes = x ;
	    else
		a->next = x ;
	}
	lck_mtx_unlock(dn_mutex);
    } else { /* config queue */
	struct dn_flow_set *x, *a, *b ;

	lck_mtx_lock(dn_mutex);
	/* locate flow_set: list kept sorted by fs_nr */
	for (a=NULL, b=all_flow_sets ; b && b->fs_nr < pfs->fs_nr ;
		 a = b , b = b->next) ;

	if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */
	    if (pfs->parent_nr == 0) {	/* need link to a pipe */
		lck_mtx_unlock(dn_mutex);
		return EINVAL ;
	    }
	    x = _MALLOC(sizeof(struct dn_flow_set), M_DUMMYNET, M_DONTWAIT | M_ZERO);
	    if (x == NULL) {
		lck_mtx_unlock(dn_mutex);
		printf("dummynet: no memory for new flow_set\n");
		return ENOSPC;
	    }
	    x->fs_nr = pfs->fs_nr;
	    x->parent_nr = pfs->parent_nr;
	    x->weight = pfs->weight ;	/* WF2Q weight, clamped to [1,100] */
	    if (x->weight == 0)
		x->weight = 1 ;
	    else if (x->weight > 100)
		x->weight = 100 ;
	} else {
	    /* Change parent pipe not allowed; must delete and recreate */
	    if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) {
		lck_mtx_unlock(dn_mutex);
		return EINVAL ;
	    }
	    x = b;
	}
	set_fs_parms(x, pfs);

	if ( x->rq == NULL ) { /* a new flow_set */
	    r = alloc_hash(x, pfs) ;
	    if (r) {
		lck_mtx_unlock(dn_mutex);
		FREE(x, M_DUMMYNET);
		return r ;
	    }
	    x->next = b;
	    if (a == NULL)
		all_flow_sets = x;
	    else
		a->next = x;
	}
	lck_mtx_unlock(dn_mutex);
    }
    return 0 ;
}

/*
 * Helper function to remove from a heap queues which are linked to
 * a flow_set about to be
 * deleted.
 */
static void
fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
{
    int i = 0, found = 0 ;
    /* swap-with-last removal; heap property restored once at the end */
    for (; i < h->elements ;)
	if ( ((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
	    h->elements-- ;
	    h->p[i] = h->p[h->elements] ;
	    found++ ;
	} else
	    i++ ;
    if (found)
	heapify(h);
}

/*
 * helper function to remove a pipe from a heap (can be there at most once)
 */
static void
pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
{
    if (h->elements > 0) {
	int i = 0 ;
	for (i=0; i < h->elements ; i++ ) {
	    if (h->p[i].object == p) { /* found it */
		h->elements-- ;
		h->p[i] = h->p[h->elements] ;
		heapify(h);
		break ;
	    }
	}
    }
}

/*
 * drain all queues. Called in case of severe mbuf shortage.
 */
void
dummynet_drain(void)
{
    struct dn_flow_set *fs;
    struct dn_pipe *p;
    struct mbuf *m, *mnext;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    heap_free(&ready_heap);
    heap_free(&wfq_ready_heap);
    heap_free(&extract_heap);
    /* remove all references to this pipe from flow_sets */
    for (fs = all_flow_sets; fs; fs= fs->next )
	purge_flow_set(fs, 0);

    for (p = all_pipes; p; p= p->next ) {
	purge_flow_set(&(p->fs), 0);

	/* drop everything sitting on the pipe's transmit list too */
	mnext = p->head;
	while ((m = mnext) != NULL) {
	    mnext = m->m_nextpkt;
	    DN_FREE_PKT(m);
	}
	p->head = p->tail = NULL ;
    }
}

/*
 * Fully delete a pipe or a queue, cleaning up associated info.
 */
static int
delete_pipe(struct dn_pipe *p)
{
    /* exactly one of pipe_nr / fs_nr must be set, as in config_pipe() */
    if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
	return EINVAL ;
    if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
	return EINVAL ;
    if (p->pipe_nr != 0) { /* this is an old-style pipe */
	struct dn_pipe *a, *b;
	struct dn_flow_set *fs;

	lck_mtx_lock(dn_mutex);
	/* locate pipe */
	for (a = NULL , b = all_pipes ; b && b->pipe_nr < p->pipe_nr ;
		 a = b , b = b->next) ;
	if (b == NULL || (b->pipe_nr != p->pipe_nr) ) {
	    lck_mtx_unlock(dn_mutex);
	    return EINVAL ; /* not found */
	}

	/* unlink from list of pipes */
	if (a == NULL)
	    all_pipes = b->next ;
	else
	    a->next = b->next ;
	/* remove references to this pipe from the ip_fw rules. */
	flush_pipe_ptrs(&(b->fs));

	/* remove all references to this pipe from flow_sets */
	for (fs = all_flow_sets; fs; fs= fs->next )
	    if (fs->pipe == b) {
		printf("dummynet: ++ ref to pipe %d from fs %d\n",
			p->pipe_nr, fs->fs_nr);
		fs->pipe = NULL ;
		purge_flow_set(fs, 0);
	    }
	fs_remove_from_heap(&ready_heap, &(b->fs));
	purge_pipe(b);	/* remove all data associated to this pipe */
	/* remove reference to here from extract_heap and wfq_ready_heap */
	pipe_remove_from_heap(&extract_heap, b);
	pipe_remove_from_heap(&wfq_ready_heap, b);
	lck_mtx_unlock(dn_mutex);

	FREE(b, M_DUMMYNET);
    } else { /* this is a WF2Q queue (dn_flow_set) */
	struct dn_flow_set *a, *b;

	lck_mtx_lock(dn_mutex);
	/* locate set */
	for (a = NULL, b = all_flow_sets ; b && b->fs_nr < p->fs.fs_nr ;
		 a = b , b = b->next) ;
	if (b == NULL || (b->fs_nr != p->fs.fs_nr) ) {
	    lck_mtx_unlock(dn_mutex);
	    return EINVAL ; /* not found */
	}

	if (a == NULL)
	    all_flow_sets = b->next ;
	else
	    a->next = b->next ;
	/* remove references to this flow_set from the ip_fw rules. */
	flush_pipe_ptrs(b);

	if (b->pipe != NULL) {
	    /* Update total weight on parent pipe and cleanup parent heaps */
	    b->pipe->sum -= b->weight * b->backlogged ;
	    fs_remove_from_heap(&(b->pipe->not_eligible_heap), b);
	    fs_remove_from_heap(&(b->pipe->scheduler_heap), b);
#if 1	/* XXX should i remove from idle_heap as well ? */
	    fs_remove_from_heap(&(b->pipe->idle_heap), b);
#endif
	}
	purge_flow_set(b, 1);
	lck_mtx_unlock(dn_mutex);
    }
    return 0 ;
}

/*
 * helper function used to copy data from kernel in DUMMYNET_GET
 * Serializes every queue of 'set' into the buffer at 'bp'; returns the
 * advanced buffer pointer.  Pointers in the copies are scrubbed so
 * kernel addresses do not leak to userland via next/head/tail/fs.
 */
static char *
dn_copy_set(struct dn_flow_set *set, char *bp)
{
    int i, copied = 0 ;
    struct dn_flow_queue *q, *qp = (struct dn_flow_queue *)bp;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    for (i = 0 ; i <= set->rq_size ; i++)
	for (q = set->rq[i] ; q ; q = q->next, qp++ ) {
	    /* consistency checks, diagnostic only */
	    if (q->hash_slot != i)
		printf("dummynet: ++ at %d: wrong slot (have %d, "
		    "should be %d)\n", copied, q->hash_slot, i);
	    if (q->fs != set)
		printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
		    i, q->fs, set);
	    copied++ ;
	    bcopy(q, qp, sizeof(*q));
	    /* cleanup pointers */
	    qp->next = NULL ;
	    qp->head = qp->tail = NULL ;
	    qp->fs = NULL ;
	}
    if (copied != set->rq_elements)
	printf("dummynet: ++ wrong count, have %d should be %d\n",
	    copied, set->rq_elements);
    return (char *)qp ;
}

/*
 * Size in bytes needed to serialize all pipes, flow sets and their
 * queues for DUMMYNET_GET.  Caller must hold dn_mutex.
 */
static size_t
dn_calc_size(void)
{
    struct dn_flow_set *set ;
    struct dn_pipe *p ;
    size_t size ;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    /*
     * compute size of data structures: list of pipes and flow_sets.
     */
    for (p = all_pipes, size = 0 ; p ; p = p->next )
	size += sizeof(*p) +
	    p->fs.rq_elements * sizeof(struct dn_flow_queue);
    for (set = all_flow_sets ; set ; set = set->next )
	size += sizeof(*set) +
	    set->rq_elements * sizeof(struct dn_flow_queue);
    return size ;
}

/*
 * Handler for IP_DUMMYNET_GET: serialize all pipes/flow sets into a
 * temporary buffer and copy it out to the sockopt caller.
 */
static int
dummynet_get(struct sockopt *sopt)
{
    char *buf, *bp ; /* bp is the "copy-pointer" */
    size_t size ;
    struct dn_flow_set *set ;
    struct dn_pipe *p ;
    int error=0, i ;

    /* XXX lock held too long */
    lck_mtx_lock(dn_mutex);
    /*
     * XXX: Ugly, but we need to allocate memory with M_WAITOK flag and we
     * cannot use this flag while holding a mutex.
     * Retry up to 10 times: the size may change while the lock is dropped.
     */
    for (i = 0; i < 10; i++) {
	size = dn_calc_size();
	lck_mtx_unlock(dn_mutex);
	buf = _MALLOC(size, M_TEMP, M_WAITOK);
	lck_mtx_lock(dn_mutex);
	if (size == dn_calc_size())
	    break;
	FREE(buf, M_TEMP);
	buf = NULL;
    }
    if (buf == NULL) {
	lck_mtx_unlock(dn_mutex);
	return ENOBUFS ;
    }
    for (p = all_pipes, bp = buf ; p ; p = p->next ) {
	struct dn_pipe *pipe_bp = (struct dn_pipe *)bp ;

	/*
	 * copy pipe descriptor into *bp, convert delay back to ms,
	 * then copy the flow_set descriptor(s) one at a time.
	 * After each flow_set, copy the queue descriptor it owns.
	 */
	bcopy(p, bp, sizeof(*p));
	pipe_bp->delay = (pipe_bp->delay * 1000) / (hz*10) ;
	/*
	 * XXX the following is a hack based on ->next being the
	 * first field in dn_pipe and dn_flow_set. The correct
	 * solution would be to move the dn_flow_set to the beginning
	 * of struct dn_pipe.
	 */
	pipe_bp->next = (struct dn_pipe *)DN_IS_PIPE ;
	/* clean pointers */
	pipe_bp->head = pipe_bp->tail = NULL ;
	pipe_bp->fs.next = NULL ;
	pipe_bp->fs.pipe = NULL ;
	pipe_bp->fs.rq = NULL ;

	bp += sizeof(*p);
	bp = dn_copy_set( &(p->fs), bp );
    }
    for (set = all_flow_sets ; set ; set = set->next ) {
	struct dn_flow_set *fs_bp = (struct dn_flow_set *)bp ;
	bcopy(set, bp, sizeof(*set));
	/* XXX same hack as above */
	fs_bp->next = (struct dn_flow_set *)DN_IS_QUEUE ;
	fs_bp->pipe = NULL ;
	fs_bp->rq = NULL ;
	bp += sizeof(*set);
	bp = dn_copy_set( set, bp );
    }
    lck_mtx_unlock(dn_mutex);

    error = sooptcopyout(sopt, buf, size);
    FREE(buf, M_TEMP);
    return error ;
}

/*
 * Handler for the various dummynet socket options (get, flush, config, del)
 */
static int
ip_dn_ctl(struct sockopt *sopt)
{
    int error = 0 ;
    struct dn_pipe *p, tmp_pipe;

    /* Disallow sets in really-really secure mode. */
    if (sopt->sopt_dir == SOPT_SET && securelevel >= 3)
	return (EPERM);

    switch (sopt->sopt_name) {
    default :
	printf("dummynet: -- unknown option %d", sopt->sopt_name);
	return EINVAL ;

    case IP_DUMMYNET_GET :
	error = dummynet_get(sopt);
	break ;

    case IP_DUMMYNET_FLUSH :
	dummynet_flush() ;
	break ;

    case IP_DUMMYNET_CONFIGURE :
	p = &tmp_pipe ;
	error = sooptcopyin(sopt, p, sizeof(*p), sizeof(*p));
	if (error)
	    break ;
	error = config_pipe(p);
	break ;

    case IP_DUMMYNET_DEL :	/* remove a pipe or queue */
	p = &tmp_pipe ;
	error = sooptcopyin(sopt, p, sizeof(*p), sizeof(*p));
	if (error)
	    break ;

	error = delete_pipe(p);
	break ;
    }
    return error ;
}

/*
 * Module initialization: allocate the dummynet lock, reset the global
 * pipe/flow-set lists and heaps, and install the function pointers
 * through which ip_fw dispatches into dummynet.
 */
void
ip_dn_init(void)
{
    /* setup locks */
    dn_mutex_grp_attr = lck_grp_attr_alloc_init();
    dn_mutex_grp = lck_grp_alloc_init("dn", dn_mutex_grp_attr);
    dn_mutex_attr = lck_attr_alloc_init();

    if ((dn_mutex = lck_mtx_alloc_init(dn_mutex_grp, dn_mutex_attr)) == NULL) {
	printf("ip_dn_init: can't alloc dn_mutex\n");
	return;
    }

    all_pipes = NULL ;
    all_flow_sets = NULL ;
    ready_heap.size = ready_heap.elements = 0 ;
    ready_heap.offset = 0 ;

    wfq_ready_heap.size = wfq_ready_heap.elements = 0 ;
    wfq_ready_heap.offset = 0 ;

    extract_heap.size = extract_heap.elements = 0 ;
    extract_heap.offset = 0 ;
    ip_dn_ctl_ptr = ip_dn_ctl;
    ip_dn_io_ptr = dummynet_io;
    ip_dn_ruledel_ptr = dn_rule_delete;
}