/*
 * Copyright (c) 2000-2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
 * Portions Copyright (c) 2000 Akamba Corp.
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.84 2004/08/25 09:31:30 pjd Exp $
 */

#define DUMMYNET_DEBUG

/*
 * This module implements IP dummynet, a bandwidth limiter/delay emulator
 * used in conjunction with the ipfw package.
 * Description of the data structures used is in ip_dummynet.h
 * Here you mainly find the following blocks of code:
 *  + variable declarations;
 *  + heap management functions;
 *  + scheduler and dummynet functions;
 *  + configuration and initialization.
 *
 * NOTA BENE: critical sections are protected by the "dummynet lock".
 *
 * Most important Changes:
 *
 * 010124: Fixed WF2Q behaviour
 * 010122: Fixed spl protection.
 * 000601: WF2Q support
 * 000106: large rewrite, use heaps to handle very many pipes.
 * 980513: initial release
 *
 * include files marked with XXX are probably not needed
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/queue.h>			/* XXX */
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/time.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/route.h>
#include <net/kpi_protocol.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>
#include <netinet/ip_var.h>

#include <netinet/ip6.h>		/* for ip6_input, ip6_output prototypes */
#include <netinet6/ip6_var.h>

static struct ip_fw default_rule;

/*
 * We keep a private variable for the simulation time, but we could
 * probably use an existing one ("softticks" in sys/kern/kern_timer.c)
 */
static dn_key curr_time = 0;	/* current simulation time */

/*
 * This is for the timer that fires to call dummynet(); we only enable
 * the timer when there are packets to process, otherwise it stays disabled.
 */
static int timer_enabled = 0;

static int dn_hash_size = 64;	/* default hash size */

/* statistics on number of queue searches and search steps */
static int searches, search_steps;
static int pipe_expire = 1;	/* expire queue if empty */
static int dn_max_ratio = 16;	/* max queues/buckets ratio */

static int red_lookup_depth = 256;	/* RED - default lookup table depth */
static int red_avg_pkt_size = 512;	/* RED - default medium packet size */
static int red_max_pkt_size = 1500;	/* RED - default max packet size */

static int serialize = 0;

/*
 * Three heaps contain queues and pipes that the scheduler handles:
 *
 * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
 *
 * wfq_ready_heap contains the pipes associated with WF2Q flows.
 *
 * extract_heap contains pipes associated with delay lines.
 */
static struct dn_heap ready_heap, extract_heap, wfq_ready_heap;

static int heap_init(struct dn_heap *h, int size);
static int heap_insert(struct dn_heap *h, dn_key key1, void *p);
static void heap_extract(struct dn_heap *h, void *obj);

static void transmit_event(struct dn_pipe *pipe, struct mbuf **head,
    struct mbuf **tail);
static void ready_event(struct dn_flow_queue *q, struct mbuf **head,
    struct mbuf **tail);
static void ready_event_wfq(struct dn_pipe *p, struct mbuf **head,
    struct mbuf **tail);

/*
 * Packets are retrieved from queues in dummynet in chains instead of
 * packet-by-packet. The entire list of packets is first dequeued and
 * then sent out by the following function.
 */
static void dummynet_send(struct mbuf *m);
#define HASHSIZE	16
#define HASH(num)	((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f)
static struct dn_pipe_head	pipehash[HASHSIZE];	/* all pipes */
static struct dn_flow_set_head	flowsethash[HASHSIZE];	/* all flowsets */

#ifdef SYSCTL_NODE
SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Dummynet");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
    CTLFLAG_RW | CTLFLAG_LOCKED, &dn_hash_size, 0, "Default hash table size");
SYSCTL_QUAD(_net_inet_ip_dummynet, OID_AUTO, curr_time,
    CTLFLAG_RD | CTLFLAG_LOCKED, &curr_time, "Current tick");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
    CTLFLAG_RD | CTLFLAG_LOCKED, &ready_heap.size, 0, "Size of ready heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
    CTLFLAG_RD | CTLFLAG_LOCKED, &extract_heap.size, 0, "Size of extract heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
    CTLFLAG_RD | CTLFLAG_LOCKED, &searches, 0, "Number of queue searches");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
    CTLFLAG_RD | CTLFLAG_LOCKED, &search_steps, 0, "Number of queue search steps");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
    CTLFLAG_RW | CTLFLAG_LOCKED, &pipe_expire, 0, "Expire queue if empty");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
    CTLFLAG_RW | CTLFLAG_LOCKED, &dn_max_ratio, 0,
    "Max ratio between dynamic queues and buckets");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
    CTLFLAG_RD | CTLFLAG_LOCKED, &red_lookup_depth, 0, "Depth of RED lookup table");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &red_avg_pkt_size, 0, "RED Medium packet size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &red_max_pkt_size, 0, "RED Max packet size");
#endif

#ifdef DUMMYNET_DEBUG
int dummynet_debug = 0;
#ifdef SYSCTL_NODE
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED,
    &dummynet_debug, 0, "control debugging printfs");
#endif
#define DPRINTF(X)	if (dummynet_debug) printf X
#else
#define DPRINTF(X)
#endif

/*
 * Contrary to the comment above random(), it does not actually
 * return a value in [0, 2^31 - 1], which breaks plr amongst other
 * things. Masking it should work even if the behavior of
 * the function is fixed.
 */
#define MY_RANDOM (random() & 0x7FFFFFFF)
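
/*
 * Note on the scale of plr (a sketch, inferred from the comparison in
 * dummynet_io() below): a packet-loss rate p is stored pre-scaled as
 * p * 2^31, since it is compared directly against MY_RANDOM, which is
 * uniform over [0, 2^31 - 1]. E.g. plr == 0x40000000 (2^30) drops
 * roughly one packet in four.
 */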
/* dummynet lock */
static lck_grp_t	*dn_mutex_grp;
static lck_grp_attr_t	*dn_mutex_grp_attr;
static lck_attr_t	*dn_mutex_attr;
decl_lck_mtx_data(static, dn_mutex_data);
static lck_mtx_t	*dn_mutex = &dn_mutex_data;

static int config_pipe(struct dn_pipe *p);
static int ip_dn_ctl(struct sockopt *sopt);

static void dummynet(void *);
static void dummynet_flush(void);
void dummynet_drain(void);
static ip_dn_io_t dummynet_io;

int if_tx_rdy(struct ifnet *ifp);

static void cp_flow_set_to_64_user(struct dn_flow_set *set,
    struct dn_flow_set_64 *fs_bp);
static void cp_queue_to_64_user(struct dn_flow_queue *q,
    struct dn_flow_queue_64 *qp);
static char *cp_pipe_to_64_user(struct dn_pipe *p, struct dn_pipe_64 *pipe_bp);
static char *dn_copy_set_64(struct dn_flow_set *set, char *bp);
static int cp_pipe_from_user_64(struct sockopt *sopt, struct dn_pipe *p);

static void cp_flow_set_to_32_user(struct dn_flow_set *set,
    struct dn_flow_set_32 *fs_bp);
static void cp_queue_to_32_user(struct dn_flow_queue *q,
    struct dn_flow_queue_32 *qp);
static char *cp_pipe_to_32_user(struct dn_pipe *p, struct dn_pipe_32 *pipe_bp);
static char *dn_copy_set_32(struct dn_flow_set *set, char *bp);
static int cp_pipe_from_user_32(struct sockopt *sopt, struct dn_pipe *p);

/*
 * Heap management functions.
 *
 * In the heap, the first node is element 0. Children of i are 2i+1 and 2i+2.
 * Some macros help finding parent/children so we can optimize them.
 *
 * heap_init() is called to expand the heap when needed.
 * Increment size in blocks of 16 entries.
 * XXX failure to allocate a new element is a pretty bad failure
 * as we basically stall a whole queue forever!!
 * Returns 1 on error, 0 on success
 */
#define HEAP_FATHER(x)		( ( (x) - 1 ) / 2 )
#define HEAP_LEFT(x)		( 2*(x) + 1 )
#define HEAP_IS_LEFT(x)		( (x) & 1 )
#define HEAP_RIGHT(x)		( 2*(x) + 2 )
#define HEAP_SWAP(a, b, buffer)	{ buffer = a ; a = b ; b = buffer ; }
#define HEAP_INCREMENT		15
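
/*
 * Worked example of the index arithmetic above, with the root stored
 * at element 0: HEAP_LEFT(3) == 7 and HEAP_RIGHT(3) == 8, while
 * HEAP_FATHER(7) == HEAP_FATHER(8) == 3; HEAP_IS_LEFT(7) is true
 * because left children always have odd indices. heap_init() uses
 * HEAP_INCREMENT to round sizes up to a multiple of 16:
 * (new_size + 15) & ~15, so asking for 17 entries allocates 32.
 */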

int
cp_pipe_from_user_32(struct sockopt *sopt, struct dn_pipe *p)
{
	struct dn_pipe_32 user_pipe_32;
	int error = 0;

	error = sooptcopyin(sopt, &user_pipe_32, sizeof(struct dn_pipe_32),
	    sizeof(struct dn_pipe_32));
	if (!error) {
		p->pipe_nr = user_pipe_32.pipe_nr;
		p->bandwidth = user_pipe_32.bandwidth;
		p->delay = user_pipe_32.delay;
		p->V = user_pipe_32.V;
		p->sum = user_pipe_32.sum;
		p->numbytes = user_pipe_32.numbytes;
		p->sched_time = user_pipe_32.sched_time;
		bcopy(user_pipe_32.if_name, p->if_name, IFNAMSIZ);
		p->ready = user_pipe_32.ready;

		p->fs.fs_nr = user_pipe_32.fs.fs_nr;
		p->fs.flags_fs = user_pipe_32.fs.flags_fs;
		p->fs.parent_nr = user_pipe_32.fs.parent_nr;
		p->fs.weight = user_pipe_32.fs.weight;
		p->fs.qsize = user_pipe_32.fs.qsize;
		p->fs.plr = user_pipe_32.fs.plr;
		p->fs.flow_mask = user_pipe_32.fs.flow_mask;
		p->fs.rq_size = user_pipe_32.fs.rq_size;
		p->fs.rq_elements = user_pipe_32.fs.rq_elements;
		p->fs.last_expired = user_pipe_32.fs.last_expired;
		p->fs.backlogged = user_pipe_32.fs.backlogged;
		p->fs.w_q = user_pipe_32.fs.w_q;
		p->fs.max_th = user_pipe_32.fs.max_th;
		p->fs.min_th = user_pipe_32.fs.min_th;
		p->fs.max_p = user_pipe_32.fs.max_p;
		p->fs.c_1 = user_pipe_32.fs.c_1;
		p->fs.c_2 = user_pipe_32.fs.c_2;
		p->fs.c_3 = user_pipe_32.fs.c_3;
		p->fs.c_4 = user_pipe_32.fs.c_4;
		p->fs.lookup_depth = user_pipe_32.fs.lookup_depth;
		p->fs.lookup_step = user_pipe_32.fs.lookup_step;
		p->fs.lookup_weight = user_pipe_32.fs.lookup_weight;
		p->fs.avg_pkt_size = user_pipe_32.fs.avg_pkt_size;
		p->fs.max_pkt_size = user_pipe_32.fs.max_pkt_size;
	}
	return error;
}

int
cp_pipe_from_user_64(struct sockopt *sopt, struct dn_pipe *p)
{
	struct dn_pipe_64 user_pipe_64;
	int error = 0;

	error = sooptcopyin(sopt, &user_pipe_64, sizeof(struct dn_pipe_64),
	    sizeof(struct dn_pipe_64));
	if (!error) {
		p->pipe_nr = user_pipe_64.pipe_nr;
		p->bandwidth = user_pipe_64.bandwidth;
		p->delay = user_pipe_64.delay;
		p->V = user_pipe_64.V;
		p->sum = user_pipe_64.sum;
		p->numbytes = user_pipe_64.numbytes;
		p->sched_time = user_pipe_64.sched_time;
		bcopy(user_pipe_64.if_name, p->if_name, IFNAMSIZ);
		p->ready = user_pipe_64.ready;

		p->fs.fs_nr = user_pipe_64.fs.fs_nr;
		p->fs.flags_fs = user_pipe_64.fs.flags_fs;
		p->fs.parent_nr = user_pipe_64.fs.parent_nr;
		p->fs.weight = user_pipe_64.fs.weight;
		p->fs.qsize = user_pipe_64.fs.qsize;
		p->fs.plr = user_pipe_64.fs.plr;
		p->fs.flow_mask = user_pipe_64.fs.flow_mask;
		p->fs.rq_size = user_pipe_64.fs.rq_size;
		p->fs.rq_elements = user_pipe_64.fs.rq_elements;
		p->fs.last_expired = user_pipe_64.fs.last_expired;
		p->fs.backlogged = user_pipe_64.fs.backlogged;
		p->fs.w_q = user_pipe_64.fs.w_q;
		p->fs.max_th = user_pipe_64.fs.max_th;
		p->fs.min_th = user_pipe_64.fs.min_th;
		p->fs.max_p = user_pipe_64.fs.max_p;
		p->fs.c_1 = user_pipe_64.fs.c_1;
		p->fs.c_2 = user_pipe_64.fs.c_2;
		p->fs.c_3 = user_pipe_64.fs.c_3;
		p->fs.c_4 = user_pipe_64.fs.c_4;
		p->fs.lookup_depth = user_pipe_64.fs.lookup_depth;
		p->fs.lookup_step = user_pipe_64.fs.lookup_step;
		p->fs.lookup_weight = user_pipe_64.fs.lookup_weight;
		p->fs.avg_pkt_size = user_pipe_64.fs.avg_pkt_size;
		p->fs.max_pkt_size = user_pipe_64.fs.max_pkt_size;
	}
	return error;
}

static void
cp_flow_set_to_32_user(struct dn_flow_set *set, struct dn_flow_set_32 *fs_bp)
{
	fs_bp->fs_nr = set->fs_nr;
	fs_bp->flags_fs = set->flags_fs;
	fs_bp->parent_nr = set->parent_nr;
	fs_bp->weight = set->weight;
	fs_bp->qsize = set->qsize;
	fs_bp->plr = set->plr;
	fs_bp->flow_mask = set->flow_mask;
	fs_bp->rq_size = set->rq_size;
	fs_bp->rq_elements = set->rq_elements;
	fs_bp->last_expired = set->last_expired;
	fs_bp->backlogged = set->backlogged;
	fs_bp->w_q = set->w_q;
	fs_bp->max_th = set->max_th;
	fs_bp->min_th = set->min_th;
	fs_bp->max_p = set->max_p;
	fs_bp->c_1 = set->c_1;
	fs_bp->c_2 = set->c_2;
	fs_bp->c_3 = set->c_3;
	fs_bp->c_4 = set->c_4;
	fs_bp->w_q_lookup = CAST_DOWN_EXPLICIT(user32_addr_t, set->w_q_lookup);
	fs_bp->lookup_depth = set->lookup_depth;
	fs_bp->lookup_step = set->lookup_step;
	fs_bp->lookup_weight = set->lookup_weight;
	fs_bp->avg_pkt_size = set->avg_pkt_size;
	fs_bp->max_pkt_size = set->max_pkt_size;
}

static void
cp_flow_set_to_64_user(struct dn_flow_set *set, struct dn_flow_set_64 *fs_bp)
{
	fs_bp->fs_nr = set->fs_nr;
	fs_bp->flags_fs = set->flags_fs;
	fs_bp->parent_nr = set->parent_nr;
	fs_bp->weight = set->weight;
	fs_bp->qsize = set->qsize;
	fs_bp->plr = set->plr;
	fs_bp->flow_mask = set->flow_mask;
	fs_bp->rq_size = set->rq_size;
	fs_bp->rq_elements = set->rq_elements;
	fs_bp->last_expired = set->last_expired;
	fs_bp->backlogged = set->backlogged;
	fs_bp->w_q = set->w_q;
	fs_bp->max_th = set->max_th;
	fs_bp->min_th = set->min_th;
	fs_bp->max_p = set->max_p;
	fs_bp->c_1 = set->c_1;
	fs_bp->c_2 = set->c_2;
	fs_bp->c_3 = set->c_3;
	fs_bp->c_4 = set->c_4;
	fs_bp->w_q_lookup = CAST_DOWN(user64_addr_t, set->w_q_lookup);
	fs_bp->lookup_depth = set->lookup_depth;
	fs_bp->lookup_step = set->lookup_step;
	fs_bp->lookup_weight = set->lookup_weight;
	fs_bp->avg_pkt_size = set->avg_pkt_size;
	fs_bp->max_pkt_size = set->max_pkt_size;
}

static void
cp_queue_to_32_user(struct dn_flow_queue *q, struct dn_flow_queue_32 *qp)
{
	qp->id = q->id;
	qp->len = q->len;
	qp->len_bytes = q->len_bytes;
	qp->numbytes = q->numbytes;
	qp->tot_pkts = q->tot_pkts;
	qp->tot_bytes = q->tot_bytes;
	qp->drops = q->drops;
	qp->hash_slot = q->hash_slot;
	qp->avg = q->avg;
	qp->count = q->count;
	qp->random = q->random;
	qp->q_time = q->q_time;
	qp->heap_pos = q->heap_pos;
	qp->sched_time = q->sched_time;
	qp->S = q->S;
	qp->F = q->F;
}

static void
cp_queue_to_64_user(struct dn_flow_queue *q, struct dn_flow_queue_64 *qp)
{
	qp->id = q->id;
	qp->len = q->len;
	qp->len_bytes = q->len_bytes;
	qp->numbytes = q->numbytes;
	qp->tot_pkts = q->tot_pkts;
	qp->tot_bytes = q->tot_bytes;
	qp->drops = q->drops;
	qp->hash_slot = q->hash_slot;
	qp->avg = q->avg;
	qp->count = q->count;
	qp->random = q->random;
	qp->q_time = q->q_time;
	qp->heap_pos = q->heap_pos;
	qp->sched_time = q->sched_time;
	qp->S = q->S;
	qp->F = q->F;
}

static char *
cp_pipe_to_32_user(struct dn_pipe *p, struct dn_pipe_32 *pipe_bp)
{
	char *bp;

	pipe_bp->pipe_nr = p->pipe_nr;
	pipe_bp->bandwidth = p->bandwidth;
	pipe_bp->delay = p->delay;
	bcopy(&(p->scheduler_heap), &(pipe_bp->scheduler_heap), sizeof(struct dn_heap_32));
	pipe_bp->scheduler_heap.p = CAST_DOWN_EXPLICIT(user32_addr_t, pipe_bp->scheduler_heap.p);
	bcopy(&(p->not_eligible_heap), &(pipe_bp->not_eligible_heap), sizeof(struct dn_heap_32));
	pipe_bp->not_eligible_heap.p = CAST_DOWN_EXPLICIT(user32_addr_t, pipe_bp->not_eligible_heap.p);
	bcopy(&(p->idle_heap), &(pipe_bp->idle_heap), sizeof(struct dn_heap_32));
	pipe_bp->idle_heap.p = CAST_DOWN_EXPLICIT(user32_addr_t, pipe_bp->idle_heap.p);
	pipe_bp->V = p->V;
	pipe_bp->sum = p->sum;
	pipe_bp->numbytes = p->numbytes;
	pipe_bp->sched_time = p->sched_time;
	bcopy(p->if_name, pipe_bp->if_name, IFNAMSIZ);
	pipe_bp->ifp = CAST_DOWN_EXPLICIT(user32_addr_t, p->ifp);
	pipe_bp->ready = p->ready;

	cp_flow_set_to_32_user(&(p->fs), &(pipe_bp->fs));

	pipe_bp->delay = (pipe_bp->delay * 1000) / (hz*10);
	/*
	 * XXX the following is a hack based on ->next being the
	 * first field in dn_pipe and dn_flow_set. The correct
	 * solution would be to move the dn_flow_set to the beginning
	 * of struct dn_pipe.
	 */
	pipe_bp->next = CAST_DOWN_EXPLICIT(user32_addr_t, DN_IS_PIPE);
	/* clean pointers */
	pipe_bp->head = pipe_bp->tail = (user32_addr_t)0;
	pipe_bp->fs.next = (user32_addr_t)0;
	pipe_bp->fs.pipe = (user32_addr_t)0;
	pipe_bp->fs.rq = (user32_addr_t)0;
	bp = ((char *)pipe_bp) + sizeof(struct dn_pipe_32);
	return (dn_copy_set_32(&(p->fs), bp));
}

static char *
cp_pipe_to_64_user(struct dn_pipe *p, struct dn_pipe_64 *pipe_bp)
{
	char *bp;

	pipe_bp->pipe_nr = p->pipe_nr;
	pipe_bp->bandwidth = p->bandwidth;
	pipe_bp->delay = p->delay;
	bcopy(&(p->scheduler_heap), &(pipe_bp->scheduler_heap), sizeof(struct dn_heap_64));
	pipe_bp->scheduler_heap.p = CAST_DOWN(user64_addr_t, pipe_bp->scheduler_heap.p);
	bcopy(&(p->not_eligible_heap), &(pipe_bp->not_eligible_heap), sizeof(struct dn_heap_64));
	pipe_bp->not_eligible_heap.p = CAST_DOWN(user64_addr_t, pipe_bp->not_eligible_heap.p);
	bcopy(&(p->idle_heap), &(pipe_bp->idle_heap), sizeof(struct dn_heap_64));
	pipe_bp->idle_heap.p = CAST_DOWN(user64_addr_t, pipe_bp->idle_heap.p);
	pipe_bp->V = p->V;
	pipe_bp->sum = p->sum;
	pipe_bp->numbytes = p->numbytes;
	pipe_bp->sched_time = p->sched_time;
	bcopy(p->if_name, pipe_bp->if_name, IFNAMSIZ);
	pipe_bp->ifp = CAST_DOWN(user64_addr_t, p->ifp);
	pipe_bp->ready = p->ready;

	cp_flow_set_to_64_user(&(p->fs), &(pipe_bp->fs));

	pipe_bp->delay = (pipe_bp->delay * 1000) / (hz*10);
	/*
	 * XXX the following is a hack based on ->next being the
	 * first field in dn_pipe and dn_flow_set. The correct
	 * solution would be to move the dn_flow_set to the beginning
	 * of struct dn_pipe.
	 */
	pipe_bp->next = CAST_DOWN(user64_addr_t, DN_IS_PIPE);
	/* clean pointers */
	pipe_bp->head = pipe_bp->tail = USER_ADDR_NULL;
	pipe_bp->fs.next = USER_ADDR_NULL;
	pipe_bp->fs.pipe = USER_ADDR_NULL;
	pipe_bp->fs.rq = USER_ADDR_NULL;
	bp = ((char *)pipe_bp) + sizeof(struct dn_pipe_64);
	return (dn_copy_set_64(&(p->fs), bp));
}

static int
heap_init(struct dn_heap *h, int new_size)
{
	struct dn_heap_entry *p;

	if (h->size >= new_size) {
		printf("dummynet: heap_init, Bogus call, have %d want %d\n",
		    h->size, new_size);
		return 0;
	}
	new_size = (new_size + HEAP_INCREMENT) & ~HEAP_INCREMENT;
	p = _MALLOC(new_size * sizeof(*p), M_DUMMYNET, M_DONTWAIT);
	if (p == NULL) {
		printf("dummynet: heap_init, resize %d failed\n", new_size);
		return 1;	/* error */
	}
	if (h->size > 0) {
		bcopy(h->p, p, h->size * sizeof(*p));
		FREE(h->p, M_DUMMYNET);
	}
	h->p = p;
	h->size = new_size;
	return 0;
}

/*
 * Insert element in heap. Normally, p != NULL, we insert p in
 * a new position and bubble up. If p == NULL, then the element is
 * already in place, and key is the position where to start the
 * bubble-up.
 * Returns 1 on failure (cannot allocate new heap entry).
 *
 * If offset > 0 the position (index, int) of the element in the heap is
 * also stored in the element itself at the given offset in bytes.
 */
#define SET_OFFSET(heap, node) \
	if (heap->offset > 0) \
		*((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
/*
 * RESET_OFFSET is used for sanity checks. It sets offset to an invalid value.
 */
#define RESET_OFFSET(heap, node) \
	if (heap->offset > 0) \
		*((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;

static int
heap_insert(struct dn_heap *h, dn_key key1, void *p)
{
	int son = h->elements;

	if (p == NULL)	/* data already there, set starting point */
		son = key1;
	else {		/* insert new element at the end, possibly resize */
		son = h->elements;
		if (son == h->size)	/* need resize... */
			if (heap_init(h, h->elements+1))
				return 1;	/* failure... */
		h->p[son].object = p;
		h->p[son].key = key1;
		h->elements++;
	}
	while (son > 0) {	/* bubble up */
		int father = HEAP_FATHER(son);
		struct dn_heap_entry tmp;

		if (DN_KEY_LT(h->p[father].key, h->p[son].key))
			break;	/* found right position */
		/* son smaller than father, swap and repeat */
		HEAP_SWAP(h->p[son], h->p[father], tmp);
		SET_OFFSET(h, son);
		son = father;
	}
	SET_OFFSET(h, son);
	return 0;
}
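
/*
 * How the offset machinery above is used (a sketch; the offset is
 * configured elsewhere, e.g. when a pipe's idle_heap is set up): for a
 * heap whose offset field points at the heap_pos member of
 * struct dn_flow_queue, SET_OFFSET keeps q->heap_pos equal to q's
 * current array index, so heap_extract(h, q) can find q in O(1)
 * instead of scanning the array. RESET_OFFSET stores -1, which the
 * bounds check in heap_extract() then catches as a stale index.
 */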

/*
 * remove top element from heap, or obj if obj != NULL
 */
static void
heap_extract(struct dn_heap *h, void *obj)
{
	int child, father, maxelt = h->elements - 1;

	if (maxelt < 0) {
		printf("dummynet: warning, extract from empty heap 0x%p\n", h);
		return;
	}
	father = 0;	/* default: move up smallest child */
	if (obj != NULL) {	/* extract specific element, index is at offset */
		if (h->offset <= 0)
			panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
		father = *((int *)((char *)obj + h->offset));
		if (father < 0 || father >= h->elements) {
			printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
			    father, h->elements);
			panic("dummynet: heap_extract");
		}
	}
	RESET_OFFSET(h, father);
	child = HEAP_LEFT(father);	/* left child */
	while (child <= maxelt) {	/* valid entry */
		if (child != maxelt && DN_KEY_LT(h->p[child+1].key, h->p[child].key))
			child = child+1;	/* take right child, otherwise left */
		h->p[father] = h->p[child];
		SET_OFFSET(h, father);
		father = child;
		child = HEAP_LEFT(child);	/* left child for next loop */
	}
	h->elements--;
	if (father != maxelt) {
		/*
		 * Fill hole with last entry and bubble up, reusing the insert code
		 */
		h->p[father] = h->p[maxelt];
		heap_insert(h, father, NULL);	/* this one cannot fail */
	}
}

/*
 * heapify() will reorganize data inside an array to maintain the
 * heap property. It is needed when we delete a bunch of entries.
 */
static void
heapify(struct dn_heap *h)
{
	int i;

	for (i = 0; i < h->elements; i++)
		heap_insert(h, i, NULL);
}

/*
 * cleanup the heap and free data structure
 */
static void
heap_free(struct dn_heap *h)
{
	if (h->size > 0)
		FREE(h->p, M_DUMMYNET);
	bzero(h, sizeof(*h));
}

/*
 * --- end of heap management functions ---
 */

/*
 * Return the mbuf tag holding the dummynet state. As an optimization
 * this is assumed to be the first tag on the list. If this turns out
 * wrong we'll need to search the list.
 */
static struct dn_pkt_tag *
dn_tag_get(struct mbuf *m)
{
	struct m_tag *mtag = m_tag_first(m);

	if (!(mtag != NULL &&
	    mtag->m_tag_id == KERNEL_MODULE_TAG_ID &&
	    mtag->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET))
		panic("packet on dummynet queue w/o dummynet tag: %p", m);

	return (struct dn_pkt_tag *)(mtag+1);
}
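
/*
 * Memory layout behind the (mtag+1) arithmetic above: m_tag_create()
 * places the tag payload immediately after the struct m_tag header,
 * so the dummynet state is recovered with plain pointer arithmetic:
 *
 *	struct m_tag *mtag = m_tag_first(m);
 *	struct dn_pkt_tag *pkt = (struct dn_pkt_tag *)(mtag + 1);
 *
 * dummynet_io() relies on the same layout when it fills in the tag.
 */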

/*
 * Scheduler functions:
 *
 * transmit_event() is called when the delay-line needs to enter
 * the scheduler, either because of existing pkts getting ready,
 * or new packets entering the queue. The event handled is the delivery
 * time of the packet.
 *
 * ready_event() does something similar with fixed-rate queues, and the
 * event handled is the finish time of the head pkt.
 *
 * ready_event_wfq() does something similar with WF2Q queues, and the
 * event handled is the start time of the head pkt.
 *
 * In all cases, we make sure that the data structures are consistent
 * before passing pkts out, because this might trigger recursive
 * invocations of the procedures.
 */
static void
transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail)
{
	struct mbuf *m;
	struct dn_pkt_tag *pkt = NULL;
	u_int64_t schedule_time;

	lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
	ASSERT(serialize >= 0);
	if (serialize == 0) {
		while ((m = pipe->head) != NULL) {
			pkt = dn_tag_get(m);
			if (!DN_KEY_LEQ(pkt->dn_output_time, curr_time))
				break;

			pipe->head = m->m_nextpkt;
			if (*tail != NULL)
				(*tail)->m_nextpkt = m;
			else
				*head = m;
			*tail = m;
		}

		if (*tail != NULL)
			(*tail)->m_nextpkt = NULL;
	}

	schedule_time = (pkt == NULL || DN_KEY_LEQ(pkt->dn_output_time, curr_time)) ?
	    curr_time + 1 : pkt->dn_output_time;

	/* if there are leftover packets, put the pipe into the heap for next ready event */
	if ((m = pipe->head) != NULL) {
		pkt = dn_tag_get(m);
		/* XXX should check errors on heap_insert, by draining the
		 * whole pipe p and hoping in the future we are more successful
		 */
		heap_insert(&extract_heap, schedule_time, pipe);
	}
}

/*
 * The following macro computes how many ticks we have to wait
 * before being able to transmit a packet. The credit is taken from
 * either a pipe (WF2Q) or a flow_queue (per-flow queueing).
 */

/*
 * hz is 100, which gives a granularity of 10ms in the old timer.
 * The timer has been changed to fire every 1ms, so the use of
 * hz has been modified here. All instances of hz have been left
 * in place but adjusted by a factor of 10 so that hz is functionally
 * equal to 1000.
 */
#define SET_TICKS(_m, q, p) \
    ((_m)->m_pkthdr.len*8*(hz*10) - (q)->numbytes + p->bandwidth - 1 ) / \
    p->bandwidth ;
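
/*
 * Worked example for SET_TICKS(): with 1 ms ticks, a packet of len
 * bytes occupies the wire for len*8 bits, i.e. len*8*1000/bandwidth
 * ms at `bandwidth' bits per second; hz*10 supplies the factor of
 * 1000 (hz == 100 here) and "+ p->bandwidth - 1" makes the division
 * round up. E.g. a 100-byte packet on a 1,000,000 bit/s pipe with no
 * accumulated credit waits (100*8*1000 + 999999) / 1000000 = 1 tick
 * (0.8 ms, rounded up).
 */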

/*
 * extract pkt from queue, compute output time (could be now)
 * and put into delay line (p_queue)
 */
static void
move_pkt(struct mbuf *pkt, struct dn_flow_queue *q, struct dn_pipe *p, int len)
{
	struct dn_pkt_tag *dt = dn_tag_get(pkt);

	q->head = pkt->m_nextpkt;
	q->len--;
	q->len_bytes -= len;

	dt->dn_output_time = curr_time + p->delay;

	if (p->head == NULL)
		p->head = pkt;
	else
		p->tail->m_nextpkt = pkt;
	p->tail = pkt;
	p->tail->m_nextpkt = NULL;
}

/*
 * ready_event() is invoked every time the queue must enter the
 * scheduler, either because the first packet arrives, or because
 * a previously scheduled event fired.
 * On invocation, drain as many pkts as possible (could be 0) and then
 * if there are leftover packets reinsert the pkt in the scheduler.
 */
static void
ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail)
{
	struct mbuf *pkt;
	struct dn_pipe *p = q->fs->pipe;
	int p_was_empty;

	lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

	if (p == NULL) {
		printf("dummynet: ready_event pipe is gone\n");
		return;
	}
	p_was_empty = (p->head == NULL);

	/*
	 * schedule fixed-rate queues linked to this pipe:
	 * Account for the bw accumulated since last scheduling, then
	 * drain as many pkts as allowed by q->numbytes and move to
	 * the delay line (in p) computing output time.
	 * bandwidth==0 (no limit) means we can drain the whole queue,
	 * setting len_scaled = 0 does the job.
	 */
	q->numbytes += (curr_time - q->sched_time) * p->bandwidth;
	while ((pkt = q->head) != NULL) {
		int len = pkt->m_pkthdr.len;
		int len_scaled = p->bandwidth ? len*8*(hz*10) : 0;
		if (len_scaled > q->numbytes)
			break;
		q->numbytes -= len_scaled;
		move_pkt(pkt, q, p, len);
	}
	/*
	 * If we have more packets queued, schedule next ready event
	 * (can only occur when bandwidth != 0, otherwise we would have
	 * flushed the whole queue in the previous loop).
	 * To this purpose we record the current time and compute how many
	 * ticks to go for the finish time of the packet.
	 */
	if ((pkt = q->head) != NULL) {	/* this implies bandwidth != 0 */
		dn_key t = SET_TICKS(pkt, q, p);	/* ticks i have to wait */
		q->sched_time = curr_time;
		heap_insert(&ready_heap, curr_time + t, (void *)q);
		/* XXX should check errors on heap_insert, and drain the whole
		 * queue on error hoping next time we are luckier.
		 */
	} else {	/* RED needs to know when the queue becomes empty */
		q->q_time = curr_time;
		q->numbytes = 0;
	}
	/*
	 * If the delay line was empty call transmit_event(p) now.
	 * Otherwise, the scheduler will take care of it.
	 */
	if (p_was_empty)
		transmit_event(p, head, tail);
}
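
/*
 * Credit arithmetic used in ready_event(), in concrete numbers:
 * q->numbytes is kept in units of bits*1000 (bits * hz*10). Every
 * 1 ms tick adds p->bandwidth to the credit, so after t ticks the
 * credit is t * bandwidth, and a packet becomes sendable once
 * t * bandwidth >= len*8*1000 == len_scaled -- exactly the
 * len*8*1000/bandwidth ms its serialization takes.
 */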

/*
 * Called when we can transmit packets on WF2Q queues. Take pkts out of
 * the queues at their start time, and enqueue into the delay line.
 * Packets are drained until p->numbytes < 0. As long as
 * len_scaled >= p->numbytes, the packet goes into the delay line
 * with a deadline p->delay. For the last packet, if p->numbytes < 0,
 * there is an additional delay.
 */
static void
ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
{
	int p_was_empty = (p->head == NULL);
	struct dn_heap *sch = &(p->scheduler_heap);
	struct dn_heap *neh = &(p->not_eligible_heap);
	int64_t p_numbytes = p->numbytes;

	lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

	if (p->if_name[0] == 0)	/* tx clock is simulated */
		p_numbytes += (curr_time - p->sched_time) * p->bandwidth;
	else {	/* tx clock is for real, the ifq must be empty or this is a NOP */
		if (p->ifp && !IFCQ_IS_EMPTY(&p->ifp->if_snd))
			return;
		else {
			DPRINTF(("dummynet: pipe %d ready from %s --\n",
			    p->pipe_nr, p->if_name));
		}
	}

	/*
	 * While we have backlogged traffic AND credit, we need to do
	 * something on the queue.
	 */
	while (p_numbytes >= 0 && (sch->elements > 0 || neh->elements > 0)) {
		if (sch->elements > 0) {	/* have some eligible pkts to send out */
			struct dn_flow_queue *q = sch->p[0].object;
			struct mbuf *pkt = q->head;
			struct dn_flow_set *fs = q->fs;
			u_int64_t len = pkt->m_pkthdr.len;
			int len_scaled = p->bandwidth ? len*8*(hz*10) : 0;

			heap_extract(sch, NULL);	/* remove queue from heap */
			p_numbytes -= len_scaled;
			move_pkt(pkt, q, p, len);

			p->V += (len << MY_M) / p->sum;	/* update V */
			q->S = q->F;			/* update start time */
			if (q->len == 0) {	/* flow not backlogged any more */
				fs->backlogged--;
				heap_insert(&(p->idle_heap), q->F, q);
			} else {		/* still backlogged */
				/*
				 * update F and position in backlogged queue, then
				 * put flow in not_eligible_heap (we will fix this later).
				 */
				len = (q->head)->m_pkthdr.len;
				q->F += (len << MY_M) / (u_int64_t)fs->weight;
				if (DN_KEY_LEQ(q->S, p->V))
					heap_insert(neh, q->S, q);
				else
					heap_insert(sch, q->F, q);
			}
		}
		/*
		 * now compute V = max(V, min(S_i)). Remember that all elements in sch
		 * have by definition S_i <= V so if sch is not empty, V is surely
		 * the max and we must not update it. Conversely, if sch is empty
		 * we only need to look at neh.
		 */
		if (sch->elements == 0 && neh->elements > 0)
			p->V = MAX64(p->V, neh->p[0].key);
		/* move from neh to sch any packets that have become eligible */
		while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V)) {
			struct dn_flow_queue *q = neh->p[0].object;
			heap_extract(neh, NULL);
			heap_insert(sch, q->F, q);
		}

		if (p->if_name[0] != '\0') {	/* tx clock is from a real thing */
			p_numbytes = -1;	/* mark not ready for I/O */
			break;
		}
	}
	if (sch->elements == 0 && neh->elements == 0 && p_numbytes >= 0
	    && p->idle_heap.elements > 0) {
		/*
		 * no traffic and no events scheduled. We can get rid of idle-heap.
		 */
		int i;

		for (i = 0; i < p->idle_heap.elements; i++) {
			struct dn_flow_queue *q = p->idle_heap.p[i].object;

			q->F = 0;
			q->S = q->F + 1;
		}
		p->sum = 0;
		p->V = 0;
		p->idle_heap.elements = 0;
	}
	/*
	 * If we are getting clocks from dummynet (not a real interface) and
	 * if we are under credit, schedule the next ready event.
	 * Also fix the delivery time of the last packet.
	 */
	if (p->if_name[0] == 0 && p_numbytes < 0) {	/* this implies bandwidth > 0 */
		dn_key t = 0;	/* number of ticks i have to wait */

		if (p->bandwidth > 0)
			t = (p->bandwidth - 1 - p_numbytes) / p->bandwidth;
		dn_tag_get(p->tail)->dn_output_time += t;
		p->sched_time = curr_time;
		heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
		/* XXX should check errors on heap_insert, and drain the whole
		 * queue on error hoping next time we are luckier.
		 */
	}

	/* Fit (adjust if necessary) 64bit result into 32bit variable. */
	if (p_numbytes > INT_MAX)
		p->numbytes = INT_MAX;
	else if (p_numbytes < INT_MIN)
		p->numbytes = INT_MIN;
	else
		p->numbytes = p_numbytes;

	/*
	 * If the delay line was empty call transmit_event(p) now.
	 * Otherwise, the scheduler will take care of it.
	 */
	if (p_was_empty)
		transmit_event(p, head, tail);
}
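
/*
 * Numeric sketch of the WF2Q+ bookkeeping in ready_event_wfq()
 * (MY_M is the fixed-point shift from ip_dummynet.h): sending a
 * packet of len bytes advances the virtual time by
 * V += (len << MY_M) / p->sum, i.e. len divided by the total weight
 * of backlogged flows, while a still-backlogged flow advances its
 * finish time by F += (len << MY_M) / weight. With two backlogged
 * flows of weight 1 and 3 (sum == 4), the weight-3 flow's F grows a
 * third as fast per byte, so it drains three times as much data over
 * the same span of virtual time.
 */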

/*
 * This is called every 1ms. It is used to
 * increment the current tick counter and schedule expired events.
 */
static void
dummynet(__unused void *unused)
{
	void *p;	/* generic parameter to handler */
	struct dn_heap *h;
	struct dn_heap *heaps[3];
	struct mbuf *head = NULL, *tail = NULL;
	int i;
	struct dn_pipe *pe;
	struct timespec ts;
	struct timeval tv;

	heaps[0] = &ready_heap;		/* fixed-rate queues */
	heaps[1] = &wfq_ready_heap;	/* wfq queues */
	heaps[2] = &extract_heap;	/* delay line */

	lck_mtx_lock(dn_mutex);

	/* make all time measurements in milliseconds (ms) -
	 * here we convert secs and usecs to msecs (just divide the
	 * usecs and take the closest whole number).
	 */
	microuptime(&tv);
	curr_time = (tv.tv_sec * 1000) + (tv.tv_usec / 1000);

	for (i = 0; i < 3; i++) {
		h = heaps[i];
		while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) {
			if (h->p[0].key > curr_time)
				printf("dummynet: warning, heap %d is %d ticks late\n",
				    i, (int)(curr_time - h->p[0].key));
			p = h->p[0].object;	/* store a copy before heap_extract */
			heap_extract(h, NULL);	/* need to extract before processing */
			if (i == 0)
				ready_event(p, &head, &tail);
			else if (i == 1) {
				struct dn_pipe *pipe = p;
				if (pipe->if_name[0] != '\0')
					printf("dummynet: bad ready_event_wfq for pipe %s\n",
					    pipe->if_name);
				else
					ready_event_wfq(p, &head, &tail);
			} else {
				transmit_event(p, &head, &tail);
			}
		}
	}
	/* sweep pipes trying to expire idle flow_queues */
	for (i = 0; i < HASHSIZE; i++)
		SLIST_FOREACH(pe, &pipehash[i], next)
			if (pe->idle_heap.elements > 0 &&
			    DN_KEY_LT(pe->idle_heap.p[0].key, pe->V)) {
				struct dn_flow_queue *q = pe->idle_heap.p[0].object;

				heap_extract(&(pe->idle_heap), NULL);
				q->S = q->F + 1;	/* mark timestamp as invalid */
				pe->sum -= q->fs->weight;
			}

	/* check the heaps to see if there's still stuff in there, and
	 * only set the timer if there are packets to process
	 */
	timer_enabled = 0;
	for (i = 0; i < 3; i++) {
		h = heaps[i];
		if (h->elements > 0) {	/* set the timer */
			ts.tv_sec = 0;
			ts.tv_nsec = 1 * 1000000;	/* 1ms */
			timer_enabled = 1;
			bsd_timeout(dummynet, NULL, &ts);
			break;
		}
	}

	if (head != NULL)
		serialize++;

	lck_mtx_unlock(dn_mutex);

	/* Send out the de-queued list of ready-to-send packets */
	if (head != NULL) {
		dummynet_send(head);
		lck_mtx_lock(dn_mutex);
		serialize--;
		lck_mtx_unlock(dn_mutex);
	}
}
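
/*
 * Tick derivation used above and in dummynet_io(): microuptime()
 * yields seconds and microseconds, folded into 1 ms ticks as
 * tv_sec*1000 + tv_usec/1000; e.g. tv = { 2 s, 345678 us } gives
 * curr_time = 2345. The heartbeat rearms itself with a 1 ms
 * (ts.tv_nsec = 1000000) timeout only while any heap is non-empty.
 */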

static void
dummynet_send(struct mbuf *m)
{
	struct dn_pkt_tag *pkt;
	struct mbuf *n;

	for (; m != NULL; m = n) {
		n = m->m_nextpkt;
		m->m_nextpkt = NULL;
		pkt = dn_tag_get(m);

		DPRINTF(("dummynet_send m: %p dn_dir: %d dn_flags: 0x%x\n",
		    m, pkt->dn_dir, pkt->dn_flags));

		switch (pkt->dn_dir) {
		case DN_TO_IP_OUT: {
			struct route tmp_rt = pkt->dn_ro;
			/* Force IP_RAWOUTPUT as the IP header is fully formed */
			pkt->dn_flags |= IP_RAWOUTPUT | IP_FORWARDING;
			(void)ip_output(m, NULL, &tmp_rt, pkt->dn_flags, NULL, NULL);
			if (tmp_rt.ro_rt) {
				rtfree(tmp_rt.ro_rt);
				tmp_rt.ro_rt = NULL;
			}
			break;
		}
		case DN_TO_IP_IN:
			proto_inject(PF_INET, m);
			break;
#ifdef INET6
		case DN_TO_IP6_OUT: {
			struct route_in6 ro6;

			ro6 = pkt->dn_ro6;

			ip6_output(m, NULL, &ro6, IPV6_FORWARDING, NULL, NULL, NULL);

			if (ro6.ro_rt)
				rtfree(ro6.ro_rt);
			break;
		}
		case DN_TO_IP6_IN:
			proto_inject(PF_INET6, m);
			break;
#endif /* INET6 */
		default:
			printf("dummynet: bad switch %d!\n", pkt->dn_dir);
			m_freem(m);
			break;
		}
	}
}

/*
 * called by an interface when tx_rdy occurs.
 */
int
if_tx_rdy(struct ifnet *ifp)
{
	struct dn_pipe *p;
	struct mbuf *head = NULL, *tail = NULL;
	int i;

	lck_mtx_lock(dn_mutex);

	for (i = 0; i < HASHSIZE; i++) {
		SLIST_FOREACH(p, &pipehash[i], next)
			if (p->ifp == ifp)
				break;
		if (p != NULL)
			break;
	}
	if (p == NULL) {
		char buf[32];
		snprintf(buf, sizeof(buf), "%s%d", ifp->if_name, ifp->if_unit);
		for (i = 0; i < HASHSIZE; i++) {
			SLIST_FOREACH(p, &pipehash[i], next)
				if (!strcmp(p->if_name, buf)) {
					p->ifp = ifp;
					DPRINTF(("dummynet: ++ tx rdy from %s (now found)\n", buf));
					break;
				}
			if (p != NULL)
				break;
		}
	}
	if (p != NULL) {
		DPRINTF(("dummynet: ++ tx rdy from %s%d - qlen %d\n", ifp->if_name,
		    ifp->if_unit, IFCQ_LEN(&ifp->if_snd)));
		p->numbytes = 0;	/* mark ready for I/O */
		ready_event_wfq(p, &head, &tail);
	}

	if (head != NULL) {
		serialize++;
	}

	lck_mtx_unlock(dn_mutex);

	/* Send out the de-queued list of ready-to-send packets */
	if (head != NULL) {
		dummynet_send(head);
		lck_mtx_lock(dn_mutex);
		serialize--;
		lck_mtx_unlock(dn_mutex);
	}
	return 0;
}

/*
 * Unconditionally expire empty queues in case of shortage.
 * Returns the number of queues freed.
 */
static int
expire_queues(struct dn_flow_set *fs)
{
	struct dn_flow_queue *q, *prev;
	int i, initial_elements = fs->rq_elements;
	struct timeval timenow;

	/* reviewed for getmicrotime usage */
	getmicrotime(&timenow);

	if (fs->last_expired == timenow.tv_sec)
		return 0;
	fs->last_expired = timenow.tv_sec;
	for (i = 0; i <= fs->rq_size; i++)	/* last one is overflow */
		for (prev = NULL, q = fs->rq[i]; q != NULL; )
			if (q->head != NULL || q->S != q->F+1) {
				prev = q;
				q = q->next;
			} else {	/* entry is idle, expire it */
				struct dn_flow_queue *old_q = q;

				if (prev != NULL)
					prev->next = q = q->next;
				else
					fs->rq[i] = q = q->next;
				fs->rq_elements--;
				FREE(old_q, M_DUMMYNET);
			}
	return initial_elements - fs->rq_elements;
}

/*
 * If room, create a new queue and put at head of slot i;
 * otherwise, create or use the default queue.
 */
static struct dn_flow_queue *
create_queue(struct dn_flow_set *fs, int i)
{
	struct dn_flow_queue *q;

	if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
	    expire_queues(fs) == 0) {
		/*
		 * No way to get room, use or create overflow queue.
		 */
		i = fs->rq_size;
		if (fs->rq[i] != NULL)
			return fs->rq[i];
	}
	q = _MALLOC(sizeof(*q), M_DUMMYNET, M_DONTWAIT | M_ZERO);
	if (q == NULL) {
		printf("dummynet: sorry, cannot allocate queue for new flow\n");
		return NULL;
	}
	q->fs = fs;
	q->hash_slot = i;
	q->next = fs->rq[i];
	q->S = q->F + 1;	/* hack - mark timestamp as invalid */
	fs->rq[i] = q;
	fs->rq_elements++;
	return q;
}

/*
 * Given a flow_set and a pkt in last_pkt, find a matching queue
 * after appropriate masking. The queue is moved to front
 * so that further searches take less time.
 */
static struct dn_flow_queue *
find_queue(struct dn_flow_set *fs, struct ip_flow_id *id)
{
	int i = 0;	/* we need i and q for new allocations */
	struct dn_flow_queue *q, *prev;
	int is_v6 = IS_IP6_FLOW_ID(id);

	if (!(fs->flags_fs & DN_HAVE_FLOW_MASK))
		q = fs->rq[0];
	else {
		/* first, do the masking, then hash */
		id->dst_port &= fs->flow_mask.dst_port;
		id->src_port &= fs->flow_mask.src_port;
		id->proto &= fs->flow_mask.proto;
		id->flags = 0;	/* we don't care about this one */
		if (is_v6) {
			APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
			APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
			id->flow_id6 &= fs->flow_mask.flow_id6;

			i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff) ^
			    ((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff) ^
			    ((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff) ^
			    ((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff) ^

			    ((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff) ^
			    ((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff) ^
			    ((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff) ^
			    ((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff) ^

			    ((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff) ^
			    ((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff) ^
			    ((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff) ^
			    ((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff) ^

			    ((id->src_ip6.__u6_addr.__u6_addr32[0] << 16) & 0xffff) ^
			    ((id->src_ip6.__u6_addr.__u6_addr32[1] << 16) & 0xffff) ^
			    ((id->src_ip6.__u6_addr.__u6_addr32[2] << 16) & 0xffff) ^
			    ((id->src_ip6.__u6_addr.__u6_addr32[3] << 16) & 0xffff) ^

			    (id->dst_port << 1) ^ (id->src_port) ^
			    (id->proto) ^
			    (id->flow_id6);
		} else {
			id->dst_ip &= fs->flow_mask.dst_ip;
			id->src_ip &= fs->flow_mask.src_ip;

			i = ((id->dst_ip) & 0xffff) ^
			    ((id->dst_ip >> 15) & 0xffff) ^
			    ((id->src_ip << 1) & 0xffff) ^
			    ((id->src_ip >> 16) & 0xffff) ^
			    (id->dst_port << 1) ^ (id->src_port) ^
			    (id->proto);
		}
		i = i % fs->rq_size;
		/* finally, scan the current list for a match */
		searches++;
		for (prev = NULL, q = fs->rq[i]; q; ) {
			search_steps++;
			if (is_v6 &&
			    IN6_ARE_ADDR_EQUAL(&id->dst_ip6, &q->id.dst_ip6) &&
			    IN6_ARE_ADDR_EQUAL(&id->src_ip6, &q->id.src_ip6) &&
			    id->dst_port == q->id.dst_port &&
			    id->src_port == q->id.src_port &&
			    id->proto == q->id.proto &&
			    id->flags == q->id.flags &&
			    id->flow_id6 == q->id.flow_id6)
				break;	/* found */

			if (!is_v6 && id->dst_ip == q->id.dst_ip &&
			    id->src_ip == q->id.src_ip &&
			    id->dst_port == q->id.dst_port &&
			    id->src_port == q->id.src_port &&
			    id->proto == q->id.proto &&
			    id->flags == q->id.flags)
				break;	/* found */
			/* No match. Check if we can expire the entry */
			if (pipe_expire && q->head == NULL && q->S == q->F+1) {
				/* entry is idle and not in any heap, expire it */
				struct dn_flow_queue *old_q = q;

				if (prev != NULL)
					prev->next = q = q->next;
				else
					fs->rq[i] = q = q->next;
				fs->rq_elements--;
				FREE(old_q, M_DUMMYNET);
				continue;
			}
			prev = q;
			q = q->next;
		}
		if (q && prev != NULL) {	/* found and not in front */
			prev->next = q->next;
			q->next = fs->rq[i];
			fs->rq[i] = q;
		}
	}
	if (q == NULL) {	/* no match, need to allocate a new entry */
		q = create_queue(fs, i);
		if (q != NULL)
			q->id = *id;
	}
	return q;
}

static int
red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
{
	/*
	 * RED algorithm
	 *
	 * RED calculates the average queue size (avg) using a low-pass filter
	 * with an exponential weighted (w_q) moving average:
	 *	avg <- (1-w_q) * avg + w_q * q_size
	 * where q_size is the queue length (measured in bytes or packets).
	 *
	 * If q_size == 0, we compute the idle time for the link, and set
	 *	avg = (1 - w_q)^(idle/s)
	 * where s is the time needed for transmitting a medium-sized packet.
	 *
	 * Now, if avg < min_th the packet is enqueued.
	 * If avg > max_th the packet is dropped. Otherwise, the packet is
	 * dropped with probability P function of avg.
	 */

	int64_t p_b = 0;
	/* queue in bytes or packets? */
	u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len;

	DPRINTF(("\ndummynet: %d q: %2u ", (int)curr_time, q_size));

	/* average queue size estimation */
	if (q_size != 0) {
		/*
		 * queue is not empty, avg <- avg + (q_size - avg) * w_q
		 */
		int diff = SCALE(q_size) - q->avg;
		int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);

		q->avg += (int)v;
	} else {
		/*
		 * queue is empty, find for how long the queue has been
		 * empty and use a lookup table for computing
		 * (1 - w_q)^(idle_time/s) where s is the time to send a
		 * (small) packet.
		 * XXX check wraps...
		 */
		if (q->avg) {
			u_int t = (curr_time - q->q_time) / fs->lookup_step;

			q->avg = (t < fs->lookup_depth) ?
			    SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
		}
	}
	DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));
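
	/*
	 * Scaled-arithmetic example for the decision below (avg, the
	 * thresholds and the c_* coefficients are all SCALE()d fixed-point
	 * values): in the linear region
	 *	p_b = c_1 * avg - c_2 = max_p * (avg - min_th) / (max_th - min_th),
	 * so with min_th = 5, max_th = 15 (packets) and max_p = 0.1, an
	 * average queue of 10 packets gives p_b = 0.1 * 5/10 = 0.05.
	 * The q->count logic further down scales this by the number of
	 * packets accepted since the last drop.
	 */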

	/* should i drop? */

	if (q->avg < fs->min_th) {
		q->count = -1;
		return 0;	/* accept packet */
	}
	if (q->avg >= fs->max_th) {	/* average queue >= max threshold */
		if (fs->flags_fs & DN_IS_GENTLE_RED) {
			/*
			 * According to Gentle-RED, if avg is greater than max_th the
			 * packet is dropped with a probability
			 *	p_b = c_3 * avg - c_4
			 * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p
			 */
			p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) - fs->c_4;
		} else {
			q->count = -1;
			DPRINTF(("dummynet: - drop"));
			return 1;
		}
	} else if (q->avg > fs->min_th) {
		/*
		 * we compute p_b using the linear dropping function
		 *	p_b = c_1 * avg - c_2,
		 * where c_1 = max_p / (max_th - min_th), and
		 * c_2 = max_p * min_th / (max_th - min_th)
		 */
		p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
	}
	if (fs->flags_fs & DN_QSIZE_IS_BYTES)
		p_b = (p_b * len) / fs->max_pkt_size;
	if (++q->count == 0)
		q->random = MY_RANDOM & 0xffff;
	else {
		/*
		 * q->count counts packets arrived since last drop, so a greater
		 * value of q->count means a greater packet drop probability.
		 */
		if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
			q->count = 0;
			DPRINTF(("dummynet: - red drop"));
			/* after a drop we calculate a new random value */
			q->random = MY_RANDOM & 0xffff;
			return 1;	/* drop */
		}
	}
	/* end of RED algorithm */
	return 0;	/* accept */
}

static __inline struct dn_flow_set *
locate_flowset(int fs_nr)
{
	struct dn_flow_set *fs;

	SLIST_FOREACH(fs, &flowsethash[HASH(fs_nr)], next)
		if (fs->fs_nr == fs_nr)
			return fs;

	return (NULL);
}

static __inline struct dn_pipe *
locate_pipe(int pipe_nr)
{
	struct dn_pipe *pipe;

	SLIST_FOREACH(pipe, &pipehash[HASH(pipe_nr)], next)
		if (pipe->pipe_nr == pipe_nr)
			return (pipe);

	return (NULL);
}

/*
 * dummynet hook for packets. Below 'pipe' is a pipe or a queue
 * depending on whether WF2Q or fixed bw is used.
 *
 * pipe_nr	pipe or queue the packet is destined for.
 * dir		where shall we send the packet after dummynet.
 * m		the mbuf with the packet
 * ifp		the 'ifp' parameter from the caller.
 *		NULL in ip_input, destination interface in ip_output,
 *		real_dst in bdg_forward
 * ro		route parameter (only used in ip_output, NULL otherwise)
 * dst		destination address, only used by ip_output
 * rule		matching rule, in case of multiple passes
 * flags	flags from the caller, only used in ip_output
 */
static int
dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa, int client)
{
	struct mbuf *head = NULL, *tail = NULL;
	struct dn_pkt_tag *pkt;
	struct m_tag *mtag;
	struct dn_flow_set *fs = NULL;
	struct dn_pipe *pipe;
	u_int64_t len = m->m_pkthdr.len;
	struct dn_flow_queue *q = NULL;
	int is_pipe = 0;	/* queue semantics unless a classifier says otherwise */
	struct timespec ts;
	struct timeval tv;

	DPRINTF(("dummynet_io m: %p pipe: %d dir: %d client: %d\n",
	    m, pipe_nr, dir, client));

#if IPFIREWALL
#if IPFW2
	if (client == DN_CLIENT_IPFW) {
		ipfw_insn *cmd = fwa->fwa_ipfw_rule->cmd + fwa->fwa_ipfw_rule->act_ofs;

		if (cmd->opcode == O_LOG)
			cmd += F_LEN(cmd);
		is_pipe = (cmd->opcode == O_PIPE);
	}
#else
	if (client == DN_CLIENT_IPFW)
		is_pipe = (fwa->fwa_ipfw_rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE;
#endif
#endif /* IPFIREWALL */

#if DUMMYNET
	if (client == DN_CLIENT_PF)
		is_pipe = fwa->fwa_flags == DN_IS_PIPE ? 1 : 0;
#endif /* DUMMYNET */

	pipe_nr &= 0xffff;

	lck_mtx_lock(dn_mutex);

	/* make all time measurements in milliseconds (ms) -
	 * here we convert secs and usecs to msecs (just divide the
	 * usecs and take the closest whole number).
	 */
	microuptime(&tv);
	curr_time = (tv.tv_sec * 1000) + (tv.tv_usec / 1000);

	/*
	 * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
	 */
	if (is_pipe) {
		pipe = locate_pipe(pipe_nr);
		if (pipe != NULL)
			fs = &(pipe->fs);
	} else
		fs = locate_flowset(pipe_nr);

	if (fs == NULL) {
		goto dropit;	/* this queue/pipe does not exist! */
	}
	pipe = fs->pipe;
	if (pipe == NULL) {	/* must be a queue, try to find a matching pipe */
		pipe = locate_pipe(fs->parent_nr);

		if (pipe != NULL)
			fs->pipe = pipe;
		else {
			printf("dummynet: no pipe %d for queue %d, drop pkt\n",
			    fs->parent_nr, fs->fs_nr);
			goto dropit;
		}
	}
	q = find_queue(fs, &(fwa->fwa_id));
	if (q == NULL)
		goto dropit;	/* cannot allocate queue */
	/*
	 * update statistics, then check reasons to drop pkt
	 */
	q->tot_bytes += len;
	q->tot_pkts++;
	if (fs->plr && (MY_RANDOM < fs->plr))
		goto dropit;	/* random pkt drop */
	if (fs->flags_fs & DN_QSIZE_IS_BYTES) {
		if (q->len_bytes > fs->qsize)
			goto dropit;	/* queue size overflow */
	} else {
		if (q->len >= fs->qsize)
			goto dropit;	/* queue count overflow */
	}
	if (fs->flags_fs & DN_IS_RED && red_drops(fs, q, len))
		goto dropit;

	/* XXX expensive to zero, see if we can remove it */
	mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET,
	    sizeof(struct dn_pkt_tag), M_NOWAIT, m);
	if (mtag == NULL)
		goto dropit;	/* cannot allocate packet header */
	m_tag_prepend(m, mtag);	/* attach to mbuf chain */

	pkt = (struct dn_pkt_tag *)(mtag+1);
	bzero(pkt, sizeof(struct dn_pkt_tag));
	/* ok, i can handle the pkt now... */
	/* build and enqueue packet + parameters */
	/*
	 * PF is checked before ipfw so remember the ipfw rule only when
	 * the caller is ipfw. When the caller is PF, fwa_ipfw_rule
	 * is a fake rule just used for convenience.
	 */
	if (client == DN_CLIENT_IPFW)
		pkt->dn_ipfw_rule = fwa->fwa_ipfw_rule;
	pkt->dn_pf_rule = fwa->fwa_pf_rule;
	pkt->dn_dir = dir;
	pkt->dn_client = client;

	pkt->dn_ifp = fwa->fwa_oif;
	if (dir == DN_TO_IP_OUT) {
		/*
		 * We need to copy *ro because for ICMP pkts (and maybe others)
		 * the caller passed a pointer into the stack; dst might also be
		 * a pointer into *ro so it needs to be updated.
		 */
		if (fwa->fwa_ro) {
			pkt->dn_ro = *(fwa->fwa_ro);
			if (fwa->fwa_ro->ro_rt)
				RT_ADDREF(fwa->fwa_ro->ro_rt);
		}
		if (fwa->fwa_dst) {
			if (fwa->fwa_dst == (struct sockaddr_in *)&fwa->fwa_ro->ro_dst)	/* dst points into ro */
				fwa->fwa_dst = (struct sockaddr_in *)&(pkt->dn_ro.ro_dst);

			bcopy(fwa->fwa_dst, &pkt->dn_dst, sizeof(pkt->dn_dst));
		}
	} else if (dir == DN_TO_IP6_OUT) {
		if (fwa->fwa_ro6) {
			pkt->dn_ro6 = *(fwa->fwa_ro6);
			if (fwa->fwa_ro6->ro_rt)
				RT_ADDREF(fwa->fwa_ro6->ro_rt);
		}
		if (fwa->fwa_ro6_pmtu) {
			pkt->dn_ro6_pmtu = *(fwa->fwa_ro6_pmtu);
			if (fwa->fwa_ro6_pmtu->ro_rt)
				RT_ADDREF(fwa->fwa_ro6_pmtu->ro_rt);
		}
		if (fwa->fwa_dst6) {
			if (fwa->fwa_dst6 == (struct sockaddr_in6 *)&fwa->fwa_ro6->ro_dst)	/* dst points into ro */
				fwa->fwa_dst6 = (struct sockaddr_in6 *)&(pkt->dn_ro6.ro_dst);

			bcopy(fwa->fwa_dst6, &pkt->dn_dst6, sizeof(pkt->dn_dst6));
		}
		pkt->dn_origifp = fwa->fwa_origifp;
		pkt->dn_mtu = fwa->fwa_mtu;
		pkt->dn_alwaysfrag = fwa->fwa_alwaysfrag;
		pkt->dn_unfragpartlen = fwa->fwa_unfragpartlen;
		if (fwa->fwa_exthdrs) {
			bcopy(fwa->fwa_exthdrs, &pkt->dn_exthdrs, sizeof(pkt->dn_exthdrs));
			/*
			 * Need to zero out the source structure so the mbufs
			 * won't be freed by ip6_output()
			 */
			bzero(fwa->fwa_exthdrs, sizeof(struct ip6_exthdrs));
		}
	}
	if (dir == DN_TO_IP_OUT || dir == DN_TO_IP6_OUT) {
		pkt->dn_flags = fwa->fwa_oflags;
		if (fwa->fwa_ipoa != NULL)
			pkt->dn_ipoa = *(fwa->fwa_ipoa);
	}
	if (q->head == NULL)
		q->head = m;
	else
		q->tail->m_nextpkt = m;
	q->tail = m;
	q->len++;
	q->len_bytes += len;

	if (q->head != m)	/* flow was not idle, we are done */
		goto done;
	/*
	 * If we reach this point the flow was previously idle, so we need
	 * to schedule it. This involves different actions for fixed-rate or
	 * WF2Q queues.
	 */
	if (is_pipe) {
		/*
		 * Fixed-rate queue: just insert into the ready_heap.
		 */
		dn_key t = 0;
		if (pipe->bandwidth)
			t = SET_TICKS(m, q, pipe);
		q->sched_time = curr_time;
		if (t == 0)	/* must process it now */
			ready_event(q, &head, &tail);
		else
			heap_insert(&ready_heap, curr_time + t, q);
	} else {
		/*
		 * WF2Q. First, compute start time S: if the flow was idle (S=F+1)
		 * set S to the virtual time V for the controlling pipe, and update
		 * the sum of weights for the pipe; otherwise, remove flow from
		 * idle_heap and set S to max(F,V).
		 * Second, compute finish time F = S + len/weight.
		 * Third, if pipe was idle, update V=max(S, V).
		 * Fourth, count one more backlogged flow.
		 */
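		/*
		 * Concrete instance of the rules above: a weight-2 flow that
		 * was idle wakes up with S = pipe->V; enqueueing a 1000-byte
		 * packet then sets F = S + (1000 << MY_M)/2, i.e. 500 bytes'
		 * worth of virtual time. A flow coming off the idle_heap
		 * instead resumes at S = max(F, V), so it cannot claim credit
		 * for the time it spent idle.
		 */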
        if (DN_KEY_GT(q->S, q->F)) {	/* means timestamps are invalid */
            q->S = pipe->V;
            pipe->sum += fs->weight;	/* add weight of new queue */
        } else {
            heap_extract(&(pipe->idle_heap), q);
            q->S = MAX64(q->F, pipe->V);
        }
        q->F = q->S + (len << MY_M) / (u_int64_t)fs->weight;

        if (pipe->not_eligible_heap.elements == 0 &&
            pipe->scheduler_heap.elements == 0)
            pipe->V = MAX64(q->S, pipe->V);
        fs->backlogged++;
        /*
         * Look at eligibility. A flow is not eligible if S>V (when
         * this happens, it means that there is some other flow already
         * scheduled for the same pipe, so the scheduler_heap cannot be
         * empty). If the flow is not eligible we just store it in the
         * not_eligible_heap. Otherwise, we store it in the scheduler_heap
         * and possibly invoke ready_event_wfq() right now if there is
         * leftover credit.
         * Note that for all flows in scheduler_heap (SCH), S_i <= V,
         * and for all flows in not_eligible_heap (NEH), S_i > V.
         * So when we need to compute max(V, min(S_i)) for all i in SCH+NEH,
         * we only need to look into NEH.
         */
        if (DN_KEY_GT(q->S, pipe->V)) {	/* not eligible */
            if (pipe->scheduler_heap.elements == 0)
                printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
            heap_insert(&(pipe->not_eligible_heap), q->S, q);
        } else {
            heap_insert(&(pipe->scheduler_heap), q->F, q);
            if (pipe->numbytes >= 0) {	/* pipe is idle */
                if (pipe->scheduler_heap.elements != 1)
                    printf("dummynet: OUCH! pipe should have been idle!\n");
                DPRINTF(("dummynet: waking up pipe %d at %d\n",
                    pipe->pipe_nr, (int)(q->F >> MY_M)));
                pipe->sched_time = curr_time;
                ready_event_wfq(pipe, &head, &tail);
            }
        }
    }
done:
    /* start the timer and set the global flag if not already set */
    if (!timer_enabled) {
        ts.tv_sec = 0;
        ts.tv_nsec = 1 * 1000000;	/* 1 ms */
        timer_enabled = 1;
        bsd_timeout(dummynet, NULL, &ts);
    }

    lck_mtx_unlock(dn_mutex);

    if (head != NULL)
        dummynet_send(head);

    return 0;

dropit:
    if (q)
        q->drops++;
    lck_mtx_unlock(dn_mutex);
    m_freem(m);
    return ((fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
}

/*
 * Below, the rtfree is only needed when (pkt->dn_dir == DN_TO_IP_OUT);
 * skipping it in the other cases would probably also let us drop the
 * initial bzero of the dn_pkt_tag.
 */
#define DN_FREE_PKT(_m) do {						\
    struct m_tag *tag = m_tag_locate(_m, KERNEL_MODULE_TAG_ID,		\
        KERNEL_TAG_TYPE_DUMMYNET, NULL);				\
    if (tag) {								\
        struct dn_pkt_tag *n = (struct dn_pkt_tag *)(tag+1);		\
        if (n->dn_ro.ro_rt != NULL) {					\
            rtfree(n->dn_ro.ro_rt);					\
            n->dn_ro.ro_rt = NULL;					\
        }								\
        m_tag_delete(_m, tag);						\
    }									\
    m_freem(_m);							\
} while (0)
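/*
 * Usage sketch (illustrative): DN_FREE_PKT disposes of a single tagged
 * packet, so draining a chain linked through m_nextpkt follows the same
 * pattern used by purge_flow_set() and purge_pipe() below:
 *
 *	struct mbuf *m = q->head, *next;
 *	while (m != NULL) {
 *		next = m->m_nextpkt;
 *		DN_FREE_PKT(m);
 *		m = next;
 *	}
 */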
/*
 * Dispose of all packets and flow_queues on a flow_set.
 * If all=1, also remove the RED lookup table and other storage,
 * including the descriptor itself.
 * For the flow_set embedded in a dn_pipe, the caller MUST also clean up
 * ready_heap...
 */
static void
purge_flow_set(struct dn_flow_set *fs, int all)
{
    struct dn_flow_queue *q, *qn;
    int i;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    for (i = 0; i <= fs->rq_size; i++) {	/* last slot is the overflow bucket */
        for (q = fs->rq[i]; q; q = qn) {
            struct mbuf *m, *mnext;

            mnext = q->head;
            while ((m = mnext) != NULL) {
                mnext = m->m_nextpkt;
                DN_FREE_PKT(m);
            }
            qn = q->next;
            FREE(q, M_DUMMYNET);
        }
        fs->rq[i] = NULL;
    }
    fs->rq_elements = 0;
    if (all) {
        /* RED - free lookup table */
        if (fs->w_q_lookup)
            FREE(fs->w_q_lookup, M_DUMMYNET);
        if (fs->rq)
            FREE(fs->rq, M_DUMMYNET);
        /* if this fs is not part of a pipe, free it */
        if (fs->pipe && fs != &(fs->pipe->fs))
            FREE(fs, M_DUMMYNET);
    }
}

/*
 * Dispose of all packets queued on a pipe (not a flow_set).
 * Also free all resources associated with a pipe, which is about
 * to be deleted.
 */
static void
purge_pipe(struct dn_pipe *pipe)
{
    struct mbuf *m, *mnext;

    purge_flow_set(&(pipe->fs), 1);

    mnext = pipe->head;
    while ((m = mnext) != NULL) {
        mnext = m->m_nextpkt;
        DN_FREE_PKT(m);
    }

    heap_free(&(pipe->scheduler_heap));
    heap_free(&(pipe->not_eligible_heap));
    heap_free(&(pipe->idle_heap));
}

/*
 * Delete all pipes and heaps, returning memory. Must also
 * remove references from all ipfw rules to all pipes.
 */
static void
dummynet_flush(void)
{
    struct dn_pipe *pipe, *pipe1;
    struct dn_flow_set *fs, *fs1;
    int i;

    lck_mtx_lock(dn_mutex);

#if IPFW2
    /* remove all references to pipes ... */
    flush_pipe_ptrs(NULL);
#endif /* IPFW2 */

    /* Free heaps so we don't have unwanted events. */
    heap_free(&ready_heap);
    heap_free(&wfq_ready_heap);
    heap_free(&extract_heap);

    /*
     * Now purge all queued pkts and delete all pipes.
     *
     * XXXGL: can we merge the for(;;) cycles into one or not?
     */
    for (i = 0; i < HASHSIZE; i++)
        SLIST_FOREACH_SAFE(fs, &flowsethash[i], next, fs1) {
            SLIST_REMOVE(&flowsethash[i], fs, dn_flow_set, next);
            purge_flow_set(fs, 1);
        }
    for (i = 0; i < HASHSIZE; i++)
        SLIST_FOREACH_SAFE(pipe, &pipehash[i], next, pipe1) {
            SLIST_REMOVE(&pipehash[i], pipe, dn_pipe, next);
            purge_pipe(pipe);
            FREE(pipe, M_DUMMYNET);
        }
    lck_mtx_unlock(dn_mutex);
}

static void
dn_ipfw_rule_delete_fs(struct dn_flow_set *fs, void *r)
{
    int i;
    struct dn_flow_queue *q;
    struct mbuf *m;

    for (i = 0; i <= fs->rq_size; i++)	/* last one is the overflow bucket */
        for (q = fs->rq[i]; q; q = q->next)
            for (m = q->head; m; m = m->m_nextpkt) {
                struct dn_pkt_tag *pkt = dn_tag_get(m);
                if (pkt->dn_ipfw_rule == r)
                    pkt->dn_ipfw_rule = &default_rule;
            }
}
/*
 * When a firewall rule is deleted, scan all queues and remove the flow-id
 * from packets matching this rule.
 */
void
dn_ipfw_rule_delete(void *r)
{
    struct dn_pipe *p;
    struct dn_flow_set *fs;
    struct dn_pkt_tag *pkt;
    struct mbuf *m;
    int i;

    lck_mtx_lock(dn_mutex);

    /*
     * If the rule references a queue (dn_flow_set), then scan
     * the flow set, otherwise scan pipes. It would suffice to do
     * either, but doing both does no harm.
     */
    for (i = 0; i < HASHSIZE; i++)
        SLIST_FOREACH(fs, &flowsethash[i], next)
            dn_ipfw_rule_delete_fs(fs, r);

    for (i = 0; i < HASHSIZE; i++)
        SLIST_FOREACH(p, &pipehash[i], next) {
            fs = &(p->fs);
            dn_ipfw_rule_delete_fs(fs, r);
            for (m = p->head; m; m = m->m_nextpkt) {
                pkt = dn_tag_get(m);
                if (pkt->dn_ipfw_rule == r)
                    pkt->dn_ipfw_rule = &default_rule;
            }
        }
    lck_mtx_unlock(dn_mutex);
}

/*
 * Set up RED parameters.
 */
static int
config_red(struct dn_flow_set *p, struct dn_flow_set *x)
{
    int i;

    x->w_q = p->w_q;
    x->min_th = SCALE(p->min_th);
    x->max_th = SCALE(p->max_th);
    x->max_p = p->max_p;

    x->c_1 = p->max_p / (p->max_th - p->min_th);
    x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
    if (x->flags_fs & DN_IS_GENTLE_RED) {
        x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
        x->c_4 = (SCALE(1) - 2 * p->max_p);
    }

    /* if the lookup table already exists, free it and create it again */
    if (x->w_q_lookup) {
        FREE(x->w_q_lookup, M_DUMMYNET);
        x->w_q_lookup = NULL;
    }
    if (red_lookup_depth == 0) {
        printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth must be > 0\n");
        FREE(x, M_DUMMYNET);
        return EINVAL;
    }
    x->lookup_depth = red_lookup_depth;
    x->w_q_lookup = (u_int *)_MALLOC(x->lookup_depth * sizeof(int),
        M_DUMMYNET, M_DONTWAIT);
    if (x->w_q_lookup == NULL) {
        printf("dummynet: sorry, cannot allocate red lookup table\n");
        FREE(x, M_DUMMYNET);
        return ENOSPC;
    }

    /* fill the lookup table with (1 - w_q)^x */
    x->lookup_step = p->lookup_step;
    x->lookup_weight = p->lookup_weight;
    x->w_q_lookup[0] = SCALE(1) - x->w_q;
    for (i = 1; i < x->lookup_depth; i++)
        x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
    if (red_avg_pkt_size < 1)
        red_avg_pkt_size = 512;
    x->avg_pkt_size = red_avg_pkt_size;
    if (red_max_pkt_size < 1)
        red_max_pkt_size = 1500;
    x->max_pkt_size = red_max_pkt_size;
    return 0;
}
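/*
 * Worked example (illustrative, assuming SCALE_RED == 16 from
 * ip_dummynet.h, so SCALE(x) == (x) << 16 and SCALE_MUL(a, b) ==
 * ((a) * (b)) >> 16): a RED weight w_q = 0.002 arrives from userland
 * already scaled as p->w_q ~= 131, so
 *	w_q_lookup[0] = SCALE(1) - 131 = 65405   (~0.998)
 * and each further entry is one fixed-point multiply by lookup_weight
 * (itself (1 - w_q)^lookup_step in the same format), e.g. with
 * lookup_weight == 65405:
 *	SCALE_MUL(65405, 65405) = (65405 * 65405) >> 16 = 65274   (~0.996)
 * i.e. successive powers of (1 - w_q), as the loop comment says.
 */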
static int
alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
{
    if (x->flags_fs & DN_HAVE_FLOW_MASK) {	/* allocate some slots */
        int l = pfs->rq_size;

        if (l == 0)
            l = dn_hash_size;
        if (l < 4)
            l = 4;
        else if (l > DN_MAX_HASH_SIZE)
            l = DN_MAX_HASH_SIZE;
        x->rq_size = l;
    } else	/* one is enough for a null mask */
        x->rq_size = 1;
    x->rq = _MALLOC((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
        M_DUMMYNET, M_DONTWAIT | M_ZERO);
    if (x->rq == NULL) {
        printf("dummynet: sorry, cannot allocate queue\n");
        return ENOSPC;
    }
    x->rq_elements = 0;
    return 0;
}

static void
set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
{
    x->flags_fs = src->flags_fs;
    x->qsize = src->qsize;
    x->plr = src->plr;
    x->flow_mask = src->flow_mask;
    if (x->flags_fs & DN_QSIZE_IS_BYTES) {
        if (x->qsize > 1024 * 1024)
            x->qsize = 1024 * 1024;
    } else {
        if (x->qsize == 0)
            x->qsize = 50;
        if (x->qsize > 100)
            x->qsize = 50;
    }
    /* configuring RED */
    if (x->flags_fs & DN_IS_RED)
        config_red(src, x);	/* XXX should check errors */
}

/*
 * Set up pipe or queue parameters.
 */
static int
config_pipe(struct dn_pipe *p)
{
    int i, r;
    struct dn_flow_set *pfs = &(p->fs);
    struct dn_flow_queue *q;

    /*
     * The config program passes parameters as follows:
     * bw	= bits/second (0 means no limits),
     * delay	= ms, must be translated into ticks.
     * qsize	= slots/bytes
     */
    p->delay = (p->delay * (hz * 10)) / 1000;
    /* We need either a pipe number or a flow_set number */
    if (p->pipe_nr == 0 && pfs->fs_nr == 0)
        return EINVAL;
    if (p->pipe_nr != 0 && pfs->fs_nr != 0)
        return EINVAL;
    if (p->pipe_nr != 0) {	/* this is a pipe */
        struct dn_pipe *x, *b;

        lck_mtx_lock(dn_mutex);

        /* locate pipe */
        b = locate_pipe(p->pipe_nr);

        if (b == NULL || b->pipe_nr != p->pipe_nr) {	/* new pipe */
            x = _MALLOC(sizeof(struct dn_pipe), M_DUMMYNET, M_DONTWAIT | M_ZERO);
            if (x == NULL) {
                lck_mtx_unlock(dn_mutex);
                printf("dummynet: no memory for new pipe\n");
                return ENOSPC;
            }
            x->pipe_nr = p->pipe_nr;
            x->fs.pipe = x;
            /*
             * idle_heap is the only heap from which we extract from
             * the middle, so it needs the offset of the heap position.
             */
            x->idle_heap.size = x->idle_heap.elements = 0;
            x->idle_heap.offset = offsetof(struct dn_flow_queue, heap_pos);
        } else {
            x = b;
            /* Flush accumulated credit for all queues */
            for (i = 0; i <= x->fs.rq_size; i++)
                for (q = x->fs.rq[i]; q; q = q->next)
                    q->numbytes = 0;
        }

        x->bandwidth = p->bandwidth;
        x->numbytes = 0;	/* just in case... */
        bcopy(p->if_name, x->if_name, sizeof(p->if_name));
        x->ifp = NULL;	/* reset interface ptr */
        x->delay = p->delay;
        set_fs_parms(&(x->fs), pfs);

        if (x->fs.rq == NULL) {	/* a new pipe */
            r = alloc_hash(&(x->fs), pfs);
            if (r) {
                lck_mtx_unlock(dn_mutex);
                FREE(x, M_DUMMYNET);
                return r;
            }
            SLIST_INSERT_HEAD(&pipehash[HASH(x->pipe_nr)],
                x, next);
        }
        lck_mtx_unlock(dn_mutex);
    } else {	/* config queue */
        struct dn_flow_set *x, *b;

        lck_mtx_lock(dn_mutex);
        /* locate flow_set */
        b = locate_flowset(pfs->fs_nr);

        if (b == NULL || b->fs_nr != pfs->fs_nr) {	/* new */
            if (pfs->parent_nr == 0) {	/* need link to a pipe */
                lck_mtx_unlock(dn_mutex);
                return EINVAL;
            }
            x = _MALLOC(sizeof(struct dn_flow_set), M_DUMMYNET, M_DONTWAIT | M_ZERO);
            if (x == NULL) {
                lck_mtx_unlock(dn_mutex);
                printf("dummynet: no memory for new flow_set\n");
                return ENOSPC;
            }
            x->fs_nr = pfs->fs_nr;
            x->parent_nr = pfs->parent_nr;
            x->weight = pfs->weight;
            if (x->weight == 0)
                x->weight = 1;
            else if (x->weight > 100)
                x->weight = 100;
        } else {
            /* Changing the parent pipe is not allowed; must delete and recreate */
            if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) {
                lck_mtx_unlock(dn_mutex);
                return EINVAL;
            }
            x = b;
        }
        set_fs_parms(x, pfs);

        if (x->rq == NULL) {	/* a new flow_set */
            r = alloc_hash(x, pfs);
            if (r) {
                lck_mtx_unlock(dn_mutex);
                FREE(x, M_DUMMYNET);
                return r;
            }
            SLIST_INSERT_HEAD(&flowsethash[HASH(x->fs_nr)],
                x, next);
        }
        lck_mtx_unlock(dn_mutex);
    }
    return 0;
}
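/*
 * Usage sketch (illustrative): a user command along the lines of
 * "ipfw pipe 1 config bw 1Mbit/s delay 50" reaches config_pipe() with
 * p->pipe_nr = 1, p->bandwidth = 1000000 (bits/second) and p->delay = 50
 * (ms, rescaled above into the internal tick-based unit using hz), while
 * "ipfw queue 5 config pipe 1 weight 30" arrives with pfs->fs_nr = 5,
 * pfs->parent_nr = 1 and pfs->weight = 30, taking the flow_set branch.
 */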
/*
 * Helper function to remove from a heap queues which are linked to
 * a flow_set about to be deleted.
 */
static void
fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
{
    int i = 0, found = 0;

    while (i < h->elements) {
        if (((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
            h->elements--;
            h->p[i] = h->p[h->elements];
            found++;
        } else
            i++;
    }
    if (found)
        heapify(h);
}

/*
 * Helper function to remove a pipe from a heap (it can be there at most once).
 */
static void
pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
{
    if (h->elements > 0) {
        int i;

        for (i = 0; i < h->elements; i++) {
            if (h->p[i].object == p) {	/* found it */
                h->elements--;
                h->p[i] = h->p[h->elements];
                heapify(h);
                break;
            }
        }
    }
}

/*
 * Drain all queues. Called in case of severe mbuf shortage.
 */
void
dummynet_drain(void)
{
    struct dn_flow_set *fs;
    struct dn_pipe *p;
    struct mbuf *m, *mnext;
    int i;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    heap_free(&ready_heap);
    heap_free(&wfq_ready_heap);
    heap_free(&extract_heap);
    /* purge all queued packets, first in the flow_sets ... */
    for (i = 0; i < HASHSIZE; i++)
        SLIST_FOREACH(fs, &flowsethash[i], next)
            purge_flow_set(fs, 0);

    /* ... then in the pipes and their delay lines */
    for (i = 0; i < HASHSIZE; i++)
        SLIST_FOREACH(p, &pipehash[i], next) {
            purge_flow_set(&(p->fs), 0);

            mnext = p->head;
            while ((m = mnext) != NULL) {
                mnext = m->m_nextpkt;
                DN_FREE_PKT(m);
            }
            p->head = p->tail = NULL;
        }
}
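/*
 * Illustrative trace of the swap-with-last removal used above: with
 * h->p[] = {q0, q1, q2, q3} and q1 belonging to the dying flow_set,
 * the last element is copied over slot 1, giving {q0, q3, q2} with
 * elements == 3; since q3's key may now violate the heap order, a
 * single heapify() pass at the end restores the invariant.
 */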
/*
 * Fully delete a pipe or a queue, cleaning up associated info.
 */
static int
delete_pipe(struct dn_pipe *p)
{
    if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
        return EINVAL;
    if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
        return EINVAL;
    if (p->pipe_nr != 0) {	/* this is an old-style pipe */
        struct dn_pipe *b;
        struct dn_flow_set *fs;
        int i;

        lck_mtx_lock(dn_mutex);
        /* locate pipe */
        b = locate_pipe(p->pipe_nr);
        if (b == NULL) {
            lck_mtx_unlock(dn_mutex);
            return EINVAL;	/* not found */
        }

        /* Unlink from list of pipes. */
        SLIST_REMOVE(&pipehash[HASH(b->pipe_nr)], b, dn_pipe, next);

#if IPFW2
        /* remove references to this pipe from the ip_fw rules. */
        flush_pipe_ptrs(&(b->fs));
#endif /* IPFW2 */

        /* Remove all references to this pipe from flow_sets. */
        for (i = 0; i < HASHSIZE; i++)
            SLIST_FOREACH(fs, &flowsethash[i], next)
                if (fs->pipe == b) {
                    printf("dummynet: ++ ref to pipe %d from fs %d\n",
                        p->pipe_nr, fs->fs_nr);
                    fs->pipe = NULL;
                    purge_flow_set(fs, 0);
                }
        fs_remove_from_heap(&ready_heap, &(b->fs));

        purge_pipe(b);	/* remove all data associated with this pipe */
        /* remove references to this pipe from extract_heap and wfq_ready_heap */
        pipe_remove_from_heap(&extract_heap, b);
        pipe_remove_from_heap(&wfq_ready_heap, b);
        lck_mtx_unlock(dn_mutex);

        FREE(b, M_DUMMYNET);
    } else {	/* this is a WF2Q queue (dn_flow_set) */
        struct dn_flow_set *b;

        lck_mtx_lock(dn_mutex);
        /* locate set */
        b = locate_flowset(p->fs.fs_nr);
        if (b == NULL) {
            lck_mtx_unlock(dn_mutex);
            return EINVAL;	/* not found */
        }

#if IPFW2
        /* remove references to this flow_set from the ip_fw rules. */
        flush_pipe_ptrs(b);
#endif /* IPFW2 */

        /* Unlink from list of flowsets. */
        SLIST_REMOVE(&flowsethash[HASH(b->fs_nr)], b, dn_flow_set, next);

        if (b->pipe != NULL) {
            /* Update total weight on parent pipe and clean up parent heaps */
            b->pipe->sum -= b->weight * b->backlogged;
            fs_remove_from_heap(&(b->pipe->not_eligible_heap), b);
            fs_remove_from_heap(&(b->pipe->scheduler_heap), b);
#if 1	/* XXX should i remove from idle_heap as well ? */
            fs_remove_from_heap(&(b->pipe->idle_heap), b);
#endif
        }
        purge_flow_set(b, 1);
        lck_mtx_unlock(dn_mutex);
    }
    return 0;
}

/*
 * Helper functions used to copy data from the kernel for IP_DUMMYNET_GET,
 * in the 32-bit and 64-bit userland layouts respectively.
 */
static char *
dn_copy_set_32(struct dn_flow_set *set, char *bp)
{
    int i, copied = 0;
    struct dn_flow_queue *q;
    struct dn_flow_queue_32 *qp = (struct dn_flow_queue_32 *)bp;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    for (i = 0; i <= set->rq_size; i++)
        for (q = set->rq[i]; q; q = q->next, qp++) {
            if (q->hash_slot != i)
                printf("dummynet: ++ at %d: wrong slot (have %d, "
                    "should be %d)\n", copied, q->hash_slot, i);
            if (q->fs != set)
                printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
                    i, q->fs, set);
            copied++;
            cp_queue_to_32_user(q, qp);
            /* clean up pointers, do not leak kernel addresses */
            qp->next = (user32_addr_t)0;
            qp->head = qp->tail = (user32_addr_t)0;
            qp->fs = (user32_addr_t)0;
        }
    if (copied != set->rq_elements)
        printf("dummynet: ++ wrong count, have %d should be %d\n",
            copied, set->rq_elements);
    return (char *)qp;
}

static char *
dn_copy_set_64(struct dn_flow_set *set, char *bp)
{
    int i, copied = 0;
    struct dn_flow_queue *q;
    struct dn_flow_queue_64 *qp = (struct dn_flow_queue_64 *)bp;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    for (i = 0; i <= set->rq_size; i++)
        for (q = set->rq[i]; q; q = q->next, qp++) {
            if (q->hash_slot != i)
                printf("dummynet: ++ at %d: wrong slot (have %d, "
                    "should be %d)\n", copied, q->hash_slot, i);
            if (q->fs != set)
                printf("dummynet: ++ at %d: wrong fs ptr (have %p, should be %p)\n",
                    i, q->fs, set);
            copied++;
            cp_queue_to_64_user(q, qp);
            /* clean up pointers, do not leak kernel addresses */
            qp->next = USER_ADDR_NULL;
            qp->head = qp->tail = USER_ADDR_NULL;
            qp->fs = USER_ADDR_NULL;
        }
    if (copied != set->rq_elements)
        printf("dummynet: ++ wrong count, have %d should be %d\n",
            copied, set->rq_elements);
    return (char *)qp;
}
static size_t
dn_calc_size(int is64user)
{
    struct dn_flow_set *set;
    struct dn_pipe *p;
    size_t size = 0;
    size_t pipesize;
    size_t queuesize;
    size_t setsize;
    int i;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
    if (is64user) {
        pipesize = sizeof(struct dn_pipe_64);
        queuesize = sizeof(struct dn_flow_queue_64);
        setsize = sizeof(struct dn_flow_set_64);
    } else {
        pipesize = sizeof(struct dn_pipe_32);
        queuesize = sizeof(struct dn_flow_queue_32);
        setsize = sizeof(struct dn_flow_set_32);
    }
    /*
     * Compute size of data structures: list of pipes and flow_sets.
     * Use the userland structure sizes computed above, since those are
     * what dummynet_get() actually copies out.
     */
    for (i = 0; i < HASHSIZE; i++) {
        SLIST_FOREACH(p, &pipehash[i], next)
            size += pipesize +
                p->fs.rq_elements * queuesize;
        SLIST_FOREACH(set, &flowsethash[i], next)
            size += setsize +
                set->rq_elements * queuesize;
    }
    return size;
}

static int
dummynet_get(struct sockopt *sopt)
{
    char *buf, *bp = NULL;	/* bp is the "copy-pointer" */
    size_t size;
    struct dn_flow_set *set;
    struct dn_pipe *p;
    int error = 0, i;
    int is64user = 0;

    /* XXX lock held too long */
    lck_mtx_lock(dn_mutex);
    /*
     * XXX: Ugly, but we need to allocate memory with the M_WAITOK flag
     * and we cannot use that flag while holding a mutex.
     * So: compute the size under the lock, drop the lock to allocate,
     * then retake it and retry if the size changed in the meantime.
     */
    if (proc_is64bit(sopt->sopt_p))
        is64user = 1;
    for (i = 0; i < 10; i++) {
        size = dn_calc_size(is64user);
        lck_mtx_unlock(dn_mutex);
        buf = _MALLOC(size, M_TEMP, M_WAITOK);
        if (buf == NULL)
            return ENOBUFS;
        lck_mtx_lock(dn_mutex);
        if (size == dn_calc_size(is64user))
            break;
        FREE(buf, M_TEMP);
        buf = NULL;
    }
    if (buf == NULL) {
        lck_mtx_unlock(dn_mutex);
        return ENOBUFS;
    }

    bp = buf;
    for (i = 0; i < HASHSIZE; i++)
        SLIST_FOREACH(p, &pipehash[i], next) {
            /*
             * copy the pipe descriptor into *bp, convert delay back to ms,
             * then copy the flow_set descriptor(s) one at a time.
             * After each flow_set, copy the queue descriptors it owns.
             */
            if (is64user)
                bp = cp_pipe_to_64_user(p, (struct dn_pipe_64 *)bp);
            else
                bp = cp_pipe_to_32_user(p, (struct dn_pipe_32 *)bp);
        }
    for (i = 0; i < HASHSIZE; i++)
        SLIST_FOREACH(set, &flowsethash[i], next) {
            if (is64user) {
                struct dn_flow_set_64 *fs_bp = (struct dn_flow_set_64 *)bp;

                cp_flow_set_to_64_user(set, fs_bp);
                /* XXX same hack as above */
                fs_bp->next = CAST_DOWN(user64_addr_t, DN_IS_QUEUE);
                fs_bp->pipe = USER_ADDR_NULL;
                fs_bp->rq = USER_ADDR_NULL;
                bp += sizeof(struct dn_flow_set_64);
                bp = dn_copy_set_64(set, bp);
            } else {
                struct dn_flow_set_32 *fs_bp = (struct dn_flow_set_32 *)bp;

                cp_flow_set_to_32_user(set, fs_bp);
                /* XXX same hack as above */
                fs_bp->next = CAST_DOWN(user32_addr_t, DN_IS_QUEUE);
                fs_bp->pipe = (user32_addr_t)0;
                fs_bp->rq = (user32_addr_t)0;
                bp += sizeof(struct dn_flow_set_32);
                bp = dn_copy_set_32(set, bp);
            }
        }
    lck_mtx_unlock(dn_mutex);

    error = sooptcopyout(sopt, buf, size);
    FREE(buf, M_TEMP);
    return error;
}
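/*
 * Usage sketch (illustrative, from userland): a tool such as ipfw(8)
 * retrieves this snapshot through a raw socket; the exact buffer-sizing
 * strategy is up to the caller, since the amount of state can change
 * between calls:
 *
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *	char buf[16384];                  // assumed "large enough" here
 *	socklen_t len = sizeof(buf);
 *	if (getsockopt(s, IPPROTO_IP, IP_DUMMYNET_GET, buf, &len) == 0)
 *		... parse dn_pipe / dn_flow_set / dn_flow_queue records ...
 */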
/*
 * Handler for the various dummynet socket options (get, flush, config, del).
 */
static int
ip_dn_ctl(struct sockopt *sopt)
{
    int error = 0;
    struct dn_pipe *p, tmp_pipe;

    /* Disallow sets in really-really secure mode. */
    if (sopt->sopt_dir == SOPT_SET && securelevel >= 3)
        return (EPERM);

    switch (sopt->sopt_name) {
    default:
        printf("dummynet: -- unknown option %d\n", sopt->sopt_name);
        return EINVAL;

    case IP_DUMMYNET_GET:
        error = dummynet_get(sopt);
        break;

    case IP_DUMMYNET_FLUSH:
        dummynet_flush();
        break;

    case IP_DUMMYNET_CONFIGURE:
        p = &tmp_pipe;
        if (proc_is64bit(sopt->sopt_p))
            error = cp_pipe_from_user_64(sopt, p);
        else
            error = cp_pipe_from_user_32(sopt, p);

        if (error)
            break;
        error = config_pipe(p);
        break;

    case IP_DUMMYNET_DEL:	/* remove a pipe or queue */
        p = &tmp_pipe;
        if (proc_is64bit(sopt->sopt_p))
            error = cp_pipe_from_user_64(sopt, p);
        else
            error = cp_pipe_from_user_32(sopt, p);
        if (error)
            break;

        error = delete_pipe(p);
        break;
    }
    return error;
}

void
ip_dn_init(void)
{
    /* set up locks */
    dn_mutex_grp_attr = lck_grp_attr_alloc_init();
    dn_mutex_grp = lck_grp_alloc_init("dn", dn_mutex_grp_attr);
    dn_mutex_attr = lck_attr_alloc_init();
    lck_mtx_init(dn_mutex, dn_mutex_grp, dn_mutex_attr);

    ready_heap.size = ready_heap.elements = 0;
    ready_heap.offset = 0;

    wfq_ready_heap.size = wfq_ready_heap.elements = 0;
    wfq_ready_heap.offset = 0;

    extract_heap.size = extract_heap.elements = 0;
    extract_heap.offset = 0;

    ip_dn_ctl_ptr = ip_dn_ctl;
    ip_dn_io_ptr = dummynet_io;

    bzero(&default_rule, sizeof default_rule);

    default_rule.act_ofs = 0;
    default_rule.rulenum = IPFW_DEFAULT_RULE;
    default_rule.cmd_len = 1;
    default_rule.set = RESVD_SET;

    default_rule.cmd[0].len = 1;
    default_rule.cmd[0].opcode =
#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
        O_ACCEPT;
#else
        O_DENY;
#endif
}