/*
 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * Copyright (c) 1998-2002 Luigi Rizzo, Universita` di Pisa
 * Portions Copyright (c) 2000 Akamba Corp.
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $FreeBSD: src/sys/netinet/ip_dummynet.c,v 1.84 2004/08/25 09:31:30 pjd Exp $
 */

#define DUMMYNET_DEBUG

/*
 * This module implements IP dummynet, a bandwidth limiter/delay emulator
 * used in conjunction with the ipfw package.
 * A description of the data structures used is in ip_dummynet.h.
 * Here you mainly find the following blocks of code:
 *  + variable declarations;
 *  + heap management functions;
 *  + scheduler and dummynet functions;
 *  + configuration and initialization.
 *
 * NOTA BENE: critical sections are protected by the "dummynet lock".
 *
 * Most important changes:
 *
 * 010124: Fixed WF2Q behaviour
 * 010122: Fixed spl protection.
 * 000601: WF2Q support
 * 000106: large rewrite, use heaps to handle very many pipes.
 * 980513: initial release
 *
 * include files marked with XXX are probably not needed
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/queue.h>              /* XXX */
#include <sys/kernel.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/time.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/route.h>
#include <net/kpi_protocol.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>
#include <netinet/ip_var.h>

#include <netinet/ip6.h>            /* for ip6_input, ip6_output prototypes */
#include <netinet6/ip6_var.h>

static struct ip_fw default_rule;

/*
 * We keep a private variable for the simulation time, but we could
 * probably use an existing one ("softticks" in sys/kern/kern_timer.c).
 */
static dn_key curr_time = 0;        /* current simulation time */

/*
 * This is for the timer that fires to call dummynet(): we only enable
 * the timer when there are packets to process, otherwise it is disabled.
 */
static int timer_enabled = 0;

static int dn_hash_size = 64;       /* default hash size */

/* statistics on number of queue searches and search steps */
static int searches, search_steps;
static int pipe_expire = 1;         /* expire queue if empty */
static int dn_max_ratio = 16;       /* max queues/buckets ratio */

static int red_lookup_depth = 256;  /* RED - default lookup table depth */
static int red_avg_pkt_size = 512;  /* RED - default medium packet size */
static int red_max_pkt_size = 1500; /* RED - default max packet size */

static int serialize = 0;

/*
 * Three heaps contain queues and pipes that the scheduler handles:
 *
 * ready_heap contains all dn_flow_queue related to fixed-rate pipes.
 *
 * wfq_ready_heap contains the pipes associated with WF2Q flows.
 *
 * extract_heap contains pipes associated with delay lines.
 */
static struct dn_heap ready_heap, extract_heap, wfq_ready_heap;

static int heap_init(struct dn_heap *h, int size);
static int heap_insert(struct dn_heap *h, dn_key key1, void *p);
static void heap_extract(struct dn_heap *h, void *obj);

static void transmit_event(struct dn_pipe *pipe, struct mbuf **head,
    struct mbuf **tail);
static void ready_event(struct dn_flow_queue *q, struct mbuf **head,
    struct mbuf **tail);
static void ready_event_wfq(struct dn_pipe *p, struct mbuf **head,
    struct mbuf **tail);

/*
 * Packets are retrieved from queues in dummynet in chains instead of
 * packet-by-packet.  The entire list of packets is first dequeued and
 * then sent out by the following function.
 */
static void dummynet_send(struct mbuf *m);

#define HASHSIZE    16
#define HASH(num)   ((((num) >> 8) ^ ((num) >> 4) ^ (num)) & 0x0f)
static struct dn_pipe_head     pipehash[HASHSIZE];     /* all pipes */
static struct dn_flow_set_head flowsethash[HASHSIZE];  /* all flowsets */

#ifdef SYSCTL_NODE
SYSCTL_NODE(_net_inet_ip, OID_AUTO, dummynet,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Dummynet");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, hash_size,
    CTLFLAG_RW | CTLFLAG_LOCKED, &dn_hash_size, 0, "Default hash table size");
SYSCTL_QUAD(_net_inet_ip_dummynet, OID_AUTO, curr_time,
    CTLFLAG_RD | CTLFLAG_LOCKED, &curr_time, "Current tick");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, ready_heap,
    CTLFLAG_RD | CTLFLAG_LOCKED, &ready_heap.size, 0, "Size of ready heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, extract_heap,
    CTLFLAG_RD | CTLFLAG_LOCKED, &extract_heap.size, 0, "Size of extract heap");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, searches,
    CTLFLAG_RD | CTLFLAG_LOCKED, &searches, 0, "Number of queue searches");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, search_steps,
    CTLFLAG_RD | CTLFLAG_LOCKED, &search_steps, 0, "Number of queue search steps");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, expire,
    CTLFLAG_RW | CTLFLAG_LOCKED, &pipe_expire, 0, "Expire queue if empty");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, max_chain_len,
    CTLFLAG_RW | CTLFLAG_LOCKED, &dn_max_ratio, 0,
    "Max ratio between dynamic queues and buckets");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_lookup_depth,
    CTLFLAG_RD | CTLFLAG_LOCKED, &red_lookup_depth, 0, "Depth of RED lookup table");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_avg_pkt_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &red_avg_pkt_size, 0, "RED Medium packet size");
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, red_max_pkt_size,
    CTLFLAG_RD | CTLFLAG_LOCKED, &red_max_pkt_size, 0, "RED Max packet size");
#endif

#ifdef DUMMYNET_DEBUG
int dummynet_debug = 0;
#ifdef SYSCTL_NODE
SYSCTL_INT(_net_inet_ip_dummynet, OID_AUTO, debug, CTLFLAG_RW | CTLFLAG_LOCKED,
    &dummynet_debug, 0, "control debugging printfs");
#endif
#define DPRINTF(X)  if (dummynet_debug) printf X
#else
#define DPRINTF(X)
#endif
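
/*
 * With DUMMYNET_DEBUG compiled in, the DPRINTF() messages below can be
 * toggled at run time through the sysctl declared above, e.g. from a
 * root shell:
 *
 *	sysctl -w net.inet.ip.dummynet.debug=1
 */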
/*
 * Contrary to the comment above random(), it does not actually
 * return a value in [0, 2^31 - 1], which breaks plr amongst other
 * things.  Masking it should work even if the behavior of
 * the function is fixed.
 */
#define MY_RANDOM (random() & 0x7FFFFFFF)

/* dummynet lock */
static lck_grp_t       *dn_mutex_grp;
static lck_grp_attr_t  *dn_mutex_grp_attr;
static lck_attr_t      *dn_mutex_attr;
decl_lck_mtx_data(static, dn_mutex_data);
static lck_mtx_t       *dn_mutex = &dn_mutex_data;

static int config_pipe(struct dn_pipe *p);
static int ip_dn_ctl(struct sockopt *sopt);

static void dummynet(void *);
static void dummynet_flush(void);
void dummynet_drain(void);
static ip_dn_io_t dummynet_io;

int if_tx_rdy(struct ifnet *ifp);

static void cp_flow_set_to_64_user(struct dn_flow_set *set,
    struct dn_flow_set_64 *fs_bp);
static void cp_queue_to_64_user(struct dn_flow_queue *q,
    struct dn_flow_queue_64 *qp);
static char *cp_pipe_to_64_user(struct dn_pipe *p, struct dn_pipe_64 *pipe_bp);
static char *dn_copy_set_64(struct dn_flow_set *set, char *bp);
static int cp_pipe_from_user_64(struct sockopt *sopt, struct dn_pipe *p);

static void cp_flow_set_to_32_user(struct dn_flow_set *set,
    struct dn_flow_set_32 *fs_bp);
static void cp_queue_to_32_user(struct dn_flow_queue *q,
    struct dn_flow_queue_32 *qp);
static char *cp_pipe_to_32_user(struct dn_pipe *p, struct dn_pipe_32 *pipe_bp);
static char *dn_copy_set_32(struct dn_flow_set *set, char *bp);
static int cp_pipe_from_user_32(struct sockopt *sopt, struct dn_pipe *p);

/*
 * Heap management functions.
 *
 * In the heap, the first node is element 0.  Children of i are 2i+1 and 2i+2.
 * Some macros help finding parent/children so we can optimize them.
 *
 * heap_init() is called to expand the heap when needed.
 * Increment size in blocks of 16 entries.
 * XXX failure to allocate a new element is a pretty bad failure
 * as we basically stall a whole queue forever!!
 * Returns 1 on error, 0 on success
 */
#define HEAP_FATHER(x)          ( ( (x) - 1 ) / 2 )
#define HEAP_LEFT(x)            ( 2*(x) + 1 )
#define HEAP_IS_LEFT(x)         ( (x) & 1 )
#define HEAP_RIGHT(x)           ( 2*(x) + 2 )
#define HEAP_SWAP(a, b, buffer) { buffer = a ; a = b ; b = buffer ; }
#define HEAP_INCREMENT          15
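
/*
 * A quick sanity check of the index arithmetic above: with the array
 * layout [0 1 2 3 4 5 6], HEAP_LEFT(2) == 5 and HEAP_RIGHT(2) == 6,
 * while HEAP_FATHER(5) == HEAP_FATHER(6) == 2.  Left children always
 * sit at odd indices (2i+1), which is what HEAP_IS_LEFT() tests.
 */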
int
cp_pipe_from_user_32(struct sockopt *sopt, struct dn_pipe *p)
{
    struct dn_pipe_32 user_pipe_32;
    int error = 0;

    error = sooptcopyin(sopt, &user_pipe_32, sizeof(struct dn_pipe_32),
        sizeof(struct dn_pipe_32));
    if (!error) {
        p->pipe_nr = user_pipe_32.pipe_nr;
        p->bandwidth = user_pipe_32.bandwidth;
        p->delay = user_pipe_32.delay;
        p->V = user_pipe_32.V;
        p->sum = user_pipe_32.sum;
        p->numbytes = user_pipe_32.numbytes;
        p->sched_time = user_pipe_32.sched_time;
        bcopy(user_pipe_32.if_name, p->if_name, IFNAMSIZ);
        p->ready = user_pipe_32.ready;

        p->fs.fs_nr = user_pipe_32.fs.fs_nr;
        p->fs.flags_fs = user_pipe_32.fs.flags_fs;
        p->fs.parent_nr = user_pipe_32.fs.parent_nr;
        p->fs.weight = user_pipe_32.fs.weight;
        p->fs.qsize = user_pipe_32.fs.qsize;
        p->fs.plr = user_pipe_32.fs.plr;
        p->fs.flow_mask = user_pipe_32.fs.flow_mask;
        p->fs.rq_size = user_pipe_32.fs.rq_size;
        p->fs.rq_elements = user_pipe_32.fs.rq_elements;
        p->fs.last_expired = user_pipe_32.fs.last_expired;
        p->fs.backlogged = user_pipe_32.fs.backlogged;
        p->fs.w_q = user_pipe_32.fs.w_q;
        p->fs.max_th = user_pipe_32.fs.max_th;
        p->fs.min_th = user_pipe_32.fs.min_th;
        p->fs.max_p = user_pipe_32.fs.max_p;
        p->fs.c_1 = user_pipe_32.fs.c_1;
        p->fs.c_2 = user_pipe_32.fs.c_2;
        p->fs.c_3 = user_pipe_32.fs.c_3;
        p->fs.c_4 = user_pipe_32.fs.c_4;
        p->fs.lookup_depth = user_pipe_32.fs.lookup_depth;
        p->fs.lookup_step = user_pipe_32.fs.lookup_step;
        p->fs.lookup_weight = user_pipe_32.fs.lookup_weight;
        p->fs.avg_pkt_size = user_pipe_32.fs.avg_pkt_size;
        p->fs.max_pkt_size = user_pipe_32.fs.max_pkt_size;
    }
    return error;
}

int
cp_pipe_from_user_64(struct sockopt *sopt, struct dn_pipe *p)
{
    struct dn_pipe_64 user_pipe_64;
    int error = 0;

    error = sooptcopyin(sopt, &user_pipe_64, sizeof(struct dn_pipe_64),
        sizeof(struct dn_pipe_64));
    if (!error) {
        p->pipe_nr = user_pipe_64.pipe_nr;
        p->bandwidth = user_pipe_64.bandwidth;
        p->delay = user_pipe_64.delay;
        p->V = user_pipe_64.V;
        p->sum = user_pipe_64.sum;
        p->numbytes = user_pipe_64.numbytes;
        p->sched_time = user_pipe_64.sched_time;
        bcopy(user_pipe_64.if_name, p->if_name, IFNAMSIZ);
        p->ready = user_pipe_64.ready;

        p->fs.fs_nr = user_pipe_64.fs.fs_nr;
        p->fs.flags_fs = user_pipe_64.fs.flags_fs;
        p->fs.parent_nr = user_pipe_64.fs.parent_nr;
        p->fs.weight = user_pipe_64.fs.weight;
        p->fs.qsize = user_pipe_64.fs.qsize;
        p->fs.plr = user_pipe_64.fs.plr;
        p->fs.flow_mask = user_pipe_64.fs.flow_mask;
        p->fs.rq_size = user_pipe_64.fs.rq_size;
        p->fs.rq_elements = user_pipe_64.fs.rq_elements;
        p->fs.last_expired = user_pipe_64.fs.last_expired;
        p->fs.backlogged = user_pipe_64.fs.backlogged;
        p->fs.w_q = user_pipe_64.fs.w_q;
        p->fs.max_th = user_pipe_64.fs.max_th;
        p->fs.min_th = user_pipe_64.fs.min_th;
        p->fs.max_p = user_pipe_64.fs.max_p;
        p->fs.c_1 = user_pipe_64.fs.c_1;
        p->fs.c_2 = user_pipe_64.fs.c_2;
        p->fs.c_3 = user_pipe_64.fs.c_3;
        p->fs.c_4 = user_pipe_64.fs.c_4;
        p->fs.lookup_depth = user_pipe_64.fs.lookup_depth;
        p->fs.lookup_step = user_pipe_64.fs.lookup_step;
        p->fs.lookup_weight = user_pipe_64.fs.lookup_weight;
        p->fs.avg_pkt_size = user_pipe_64.fs.avg_pkt_size;
        p->fs.max_pkt_size = user_pipe_64.fs.max_pkt_size;
    }
    return error;
}

static void
cp_flow_set_to_32_user(struct dn_flow_set *set, struct dn_flow_set_32 *fs_bp)
{
    fs_bp->fs_nr = set->fs_nr;
    fs_bp->flags_fs = set->flags_fs;
    fs_bp->parent_nr = set->parent_nr;
    fs_bp->weight = set->weight;
    fs_bp->qsize = set->qsize;
    fs_bp->plr = set->plr;
    fs_bp->flow_mask = set->flow_mask;
    fs_bp->rq_size = set->rq_size;
    fs_bp->rq_elements = set->rq_elements;
    fs_bp->last_expired = set->last_expired;
    fs_bp->backlogged = set->backlogged;
    fs_bp->w_q = set->w_q;
    fs_bp->max_th = set->max_th;
    fs_bp->min_th = set->min_th;
    fs_bp->max_p = set->max_p;
    fs_bp->c_1 = set->c_1;
    fs_bp->c_2 = set->c_2;
    fs_bp->c_3 = set->c_3;
    fs_bp->c_4 = set->c_4;
    fs_bp->w_q_lookup = CAST_DOWN_EXPLICIT(user32_addr_t, set->w_q_lookup);
    fs_bp->lookup_depth = set->lookup_depth;
    fs_bp->lookup_step = set->lookup_step;
    fs_bp->lookup_weight = set->lookup_weight;
    fs_bp->avg_pkt_size = set->avg_pkt_size;
    fs_bp->max_pkt_size = set->max_pkt_size;
}

static void
cp_flow_set_to_64_user(struct dn_flow_set *set, struct dn_flow_set_64 *fs_bp)
{
    fs_bp->fs_nr = set->fs_nr;
    fs_bp->flags_fs = set->flags_fs;
    fs_bp->parent_nr = set->parent_nr;
    fs_bp->weight = set->weight;
    fs_bp->qsize = set->qsize;
    fs_bp->plr = set->plr;
    fs_bp->flow_mask = set->flow_mask;
    fs_bp->rq_size = set->rq_size;
    fs_bp->rq_elements = set->rq_elements;
    fs_bp->last_expired = set->last_expired;
    fs_bp->backlogged = set->backlogged;
    fs_bp->w_q = set->w_q;
    fs_bp->max_th = set->max_th;
    fs_bp->min_th = set->min_th;
    fs_bp->max_p = set->max_p;
    fs_bp->c_1 = set->c_1;
    fs_bp->c_2 = set->c_2;
    fs_bp->c_3 = set->c_3;
    fs_bp->c_4 = set->c_4;
    fs_bp->w_q_lookup = CAST_DOWN(user64_addr_t, set->w_q_lookup);
    fs_bp->lookup_depth = set->lookup_depth;
    fs_bp->lookup_step = set->lookup_step;
    fs_bp->lookup_weight = set->lookup_weight;
    fs_bp->avg_pkt_size = set->avg_pkt_size;
    fs_bp->max_pkt_size = set->max_pkt_size;
}

static void
cp_queue_to_32_user(struct dn_flow_queue *q, struct dn_flow_queue_32 *qp)
{
    qp->id = q->id;
    qp->len = q->len;
    qp->len_bytes = q->len_bytes;
    qp->numbytes = q->numbytes;
    qp->tot_pkts = q->tot_pkts;
    qp->tot_bytes = q->tot_bytes;
    qp->drops = q->drops;
    qp->hash_slot = q->hash_slot;
    qp->avg = q->avg;
    qp->count = q->count;
    qp->random = q->random;
    qp->q_time = q->q_time;
    qp->heap_pos = q->heap_pos;
    qp->sched_time = q->sched_time;
    qp->S = q->S;
    qp->F = q->F;
}

static void
cp_queue_to_64_user(struct dn_flow_queue *q, struct dn_flow_queue_64 *qp)
{
    qp->id = q->id;
    qp->len = q->len;
    qp->len_bytes = q->len_bytes;
    qp->numbytes = q->numbytes;
    qp->tot_pkts = q->tot_pkts;
    qp->tot_bytes = q->tot_bytes;
    qp->drops = q->drops;
    qp->hash_slot = q->hash_slot;
    qp->avg = q->avg;
    qp->count = q->count;
    qp->random = q->random;
    qp->q_time = q->q_time;
    qp->heap_pos = q->heap_pos;
    qp->sched_time = q->sched_time;
    qp->S = q->S;
    qp->F = q->F;
}
static char *
cp_pipe_to_32_user(struct dn_pipe *p, struct dn_pipe_32 *pipe_bp)
{
    char *bp;

    pipe_bp->pipe_nr = p->pipe_nr;
    pipe_bp->bandwidth = p->bandwidth;
    pipe_bp->delay = p->delay;
    bcopy(&(p->scheduler_heap), &(pipe_bp->scheduler_heap),
        sizeof(struct dn_heap_32));
    pipe_bp->scheduler_heap.p =
        CAST_DOWN_EXPLICIT(user32_addr_t, pipe_bp->scheduler_heap.p);
    bcopy(&(p->not_eligible_heap), &(pipe_bp->not_eligible_heap),
        sizeof(struct dn_heap_32));
    pipe_bp->not_eligible_heap.p =
        CAST_DOWN_EXPLICIT(user32_addr_t, pipe_bp->not_eligible_heap.p);
    bcopy(&(p->idle_heap), &(pipe_bp->idle_heap), sizeof(struct dn_heap_32));
    pipe_bp->idle_heap.p =
        CAST_DOWN_EXPLICIT(user32_addr_t, pipe_bp->idle_heap.p);
    pipe_bp->V = p->V;
    pipe_bp->sum = p->sum;
    pipe_bp->numbytes = p->numbytes;
    pipe_bp->sched_time = p->sched_time;
    bcopy(p->if_name, pipe_bp->if_name, IFNAMSIZ);
    pipe_bp->ifp = CAST_DOWN_EXPLICIT(user32_addr_t, p->ifp);
    pipe_bp->ready = p->ready;

    cp_flow_set_to_32_user(&(p->fs), &(pipe_bp->fs));

    pipe_bp->delay = (pipe_bp->delay * 1000) / (hz*10);
    /*
     * XXX the following is a hack based on ->next being the
     * first field in dn_pipe and dn_flow_set.  The correct
     * solution would be to move the dn_flow_set to the beginning
     * of struct dn_pipe.
     */
    pipe_bp->next = CAST_DOWN_EXPLICIT(user32_addr_t, DN_IS_PIPE);
    /* clean pointers */
    pipe_bp->head = pipe_bp->tail = (user32_addr_t)0;
    pipe_bp->fs.next = (user32_addr_t)0;
    pipe_bp->fs.pipe = (user32_addr_t)0;
    pipe_bp->fs.rq = (user32_addr_t)0;
    bp = ((char *)pipe_bp) + sizeof(struct dn_pipe_32);
    return (dn_copy_set_32(&(p->fs), bp));
}

static char *
cp_pipe_to_64_user(struct dn_pipe *p, struct dn_pipe_64 *pipe_bp)
{
    char *bp;

    pipe_bp->pipe_nr = p->pipe_nr;
    pipe_bp->bandwidth = p->bandwidth;
    pipe_bp->delay = p->delay;
    bcopy(&(p->scheduler_heap), &(pipe_bp->scheduler_heap),
        sizeof(struct dn_heap_64));
    pipe_bp->scheduler_heap.p =
        CAST_DOWN(user64_addr_t, pipe_bp->scheduler_heap.p);
    bcopy(&(p->not_eligible_heap), &(pipe_bp->not_eligible_heap),
        sizeof(struct dn_heap_64));
    pipe_bp->not_eligible_heap.p =
        CAST_DOWN(user64_addr_t, pipe_bp->not_eligible_heap.p);
    bcopy(&(p->idle_heap), &(pipe_bp->idle_heap), sizeof(struct dn_heap_64));
    pipe_bp->idle_heap.p = CAST_DOWN(user64_addr_t, pipe_bp->idle_heap.p);
    pipe_bp->V = p->V;
    pipe_bp->sum = p->sum;
    pipe_bp->numbytes = p->numbytes;
    pipe_bp->sched_time = p->sched_time;
    bcopy(p->if_name, pipe_bp->if_name, IFNAMSIZ);
    pipe_bp->ifp = CAST_DOWN(user64_addr_t, p->ifp);
    pipe_bp->ready = p->ready;

    cp_flow_set_to_64_user(&(p->fs), &(pipe_bp->fs));

    pipe_bp->delay = (pipe_bp->delay * 1000) / (hz*10);
    /*
     * XXX the following is a hack based on ->next being the
     * first field in dn_pipe and dn_flow_set.  The correct
     * solution would be to move the dn_flow_set to the beginning
     * of struct dn_pipe.
     */
    pipe_bp->next = CAST_DOWN(user64_addr_t, DN_IS_PIPE);
    /* clean pointers */
    pipe_bp->head = pipe_bp->tail = USER_ADDR_NULL;
    pipe_bp->fs.next = USER_ADDR_NULL;
    pipe_bp->fs.pipe = USER_ADDR_NULL;
    pipe_bp->fs.rq = USER_ADDR_NULL;
    bp = ((char *)pipe_bp) + sizeof(struct dn_pipe_64);
    return (dn_copy_set_64(&(p->fs), bp));
}

static int
heap_init(struct dn_heap *h, int new_size)
{
    struct dn_heap_entry *p;

    if (h->size >= new_size) {
        printf("dummynet: heap_init, bogus call, have %d want %d\n",
            h->size, new_size);
        return 0;
    }
    new_size = (new_size + HEAP_INCREMENT) & ~HEAP_INCREMENT;
    p = _MALLOC(new_size * sizeof(*p), M_DUMMYNET, M_DONTWAIT);
    if (p == NULL) {
        printf("dummynet: heap_init, resize %d failed\n", new_size);
        return 1;   /* error */
    }
    if (h->size > 0) {
        bcopy(h->p, p, h->size * sizeof(*p));
        FREE(h->p, M_DUMMYNET);
    }
    h->p = p;
    h->size = new_size;
    return 0;
}
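
/*
 * The rounding above grows the heap in blocks of HEAP_INCREMENT+1
 * entries: e.g. a request for 17 slots becomes (17 + 15) & ~15 == 32,
 * so repeated single-element inserts trigger at most one allocation
 * per 16 insertions.
 */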
/*
 * Insert element in heap.  Normally, p != NULL, we insert p in
 * a new position and bubble up.  If p == NULL, then the element is
 * already in place, and key is the position where to start the
 * bubble-up.
 * Returns 1 on failure (cannot allocate new heap entry).
 *
 * If offset > 0 the position (index, int) of the element in the heap is
 * also stored in the element itself at the given offset in bytes.
 */
#define SET_OFFSET(heap, node) \
    if (heap->offset > 0) \
        *((int *)((char *)(heap->p[node].object) + heap->offset)) = node ;
/*
 * RESET_OFFSET is used for sanity checks.  It sets offset to an invalid value.
 */
#define RESET_OFFSET(heap, node) \
    if (heap->offset > 0) \
        *((int *)((char *)(heap->p[node].object) + heap->offset)) = -1 ;
static int
heap_insert(struct dn_heap *h, dn_key key1, void *p)
{
    int son = h->elements;

    if (p == NULL)      /* data already there, set starting point */
        son = key1;
    else {              /* insert new element at the end, possibly resize */
        son = h->elements;
        if (son == h->size)     /* need resize... */
            if (heap_init(h, h->elements+1))
                return 1;       /* failure... */
        h->p[son].object = p;
        h->p[son].key = key1;
        h->elements++;
    }
    while (son > 0) {           /* bubble up */
        int father = HEAP_FATHER(son);
        struct dn_heap_entry tmp;

        if (DN_KEY_LT(h->p[father].key, h->p[son].key))
            break;              /* found right position */
        /* son smaller than father, swap and repeat */
        HEAP_SWAP(h->p[son], h->p[father], tmp);
        SET_OFFSET(h, son);
        son = father;
    }
    SET_OFFSET(h, son);
    return 0;
}

/*
 * Remove top element from heap, or obj if obj != NULL.
 */
static void
heap_extract(struct dn_heap *h, void *obj)
{
    int child, father, maxelt = h->elements - 1;

    if (maxelt < 0) {
        printf("dummynet: warning, extract from empty heap 0x%llx\n",
            (uint64_t)VM_KERNEL_ADDRPERM(h));
        return;
    }
    father = 0;     /* default: move up smallest child */
    if (obj != NULL) {  /* extract specific element, index is at offset */
        if (h->offset <= 0)
            panic("dummynet: heap_extract from middle not supported on this heap!!!\n");
        father = *((int *)((char *)obj + h->offset));
        if (father < 0 || father >= h->elements) {
            printf("dummynet: heap_extract, father %d out of bound 0..%d\n",
                father, h->elements);
            panic("dummynet: heap_extract");
        }
    }
    RESET_OFFSET(h, father);
    child = HEAP_LEFT(father);          /* left child */
    while (child <= maxelt) {           /* valid entry */
        if (child != maxelt && DN_KEY_LT(h->p[child+1].key, h->p[child].key))
            child = child+1;            /* take right child, otherwise left */
        h->p[father] = h->p[child];
        SET_OFFSET(h, father);
        father = child;
        child = HEAP_LEFT(child);       /* left child for next loop */
    }
    h->elements--;
    if (father != maxelt) {
        /*
         * Fill hole with last entry and bubble up, reusing the insert code.
         */
        h->p[father] = h->p[maxelt];
        heap_insert(h, father, NULL);   /* this one cannot fail */
    }
}

/*
 * heapify() will reorganize data inside an array to maintain the
 * heap property.  It is needed when we delete a bunch of entries.
 */
static void
heapify(struct dn_heap *h)
{
    int i;

    for (i = 0; i < h->elements; i++)
        heap_insert(h, i, NULL);
}

/*
 * Cleanup the heap and free data structure.
 */
static void
heap_free(struct dn_heap *h)
{
    if (h->size > 0)
        FREE(h->p, M_DUMMYNET);
    bzero(h, sizeof(*h));
}

/*
 * --- end of heap management functions ---
 */

/*
 * Return the mbuf tag holding the dummynet state.  As an optimization
 * this is assumed to be the first tag on the list.  If this turns out
 * wrong we'll need to search the list.
 */
static struct dn_pkt_tag *
dn_tag_get(struct mbuf *m)
{
    struct m_tag *mtag = m_tag_first(m);

    if (!(mtag != NULL &&
        mtag->m_tag_id == KERNEL_MODULE_TAG_ID &&
        mtag->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET))
        panic("packet on dummynet queue w/o dummynet tag: 0x%llx",
            (uint64_t)VM_KERNEL_ADDRPERM(m));

    return (struct dn_pkt_tag *)(mtag+1);
}
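
/*
 * The (mtag+1) arithmetic works because m_tag_create() allocates the
 * m_tag header and its payload in one contiguous block, so the
 * dn_pkt_tag set up in dummynet_io() starts immediately after the
 * header.
 */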
/*
 * Scheduler functions:
 *
 * transmit_event() is called when the delay-line needs to enter
 * the scheduler, either because of existing pkts getting ready,
 * or new packets entering the queue.  The event handled is the delivery
 * time of the packet.
 *
 * ready_event() does something similar with fixed-rate queues, and the
 * event handled is the finish time of the head pkt.
 *
 * ready_event_wfq() does something similar with WF2Q queues, and the
 * event handled is the start time of the head pkt.
 *
 * In all cases, we make sure that the data structures are consistent
 * before passing pkts out, because this might trigger recursive
 * invocations of the procedures.
 */
static void
transmit_event(struct dn_pipe *pipe, struct mbuf **head, struct mbuf **tail)
{
    struct mbuf *m;
    struct dn_pkt_tag *pkt = NULL;
    u_int64_t schedule_time;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
    ASSERT(serialize >= 0);
    if (serialize == 0) {
        while ((m = pipe->head) != NULL) {
            pkt = dn_tag_get(m);
            if (!DN_KEY_LEQ(pkt->dn_output_time, curr_time))
                break;

            pipe->head = m->m_nextpkt;
            if (*tail != NULL)
                (*tail)->m_nextpkt = m;
            else
                *head = m;
            *tail = m;
        }

        if (*tail != NULL)
            (*tail)->m_nextpkt = NULL;
    }

    schedule_time = pkt == NULL || DN_KEY_LEQ(pkt->dn_output_time, curr_time) ?
        curr_time + 1 : pkt->dn_output_time;

    /* if there are leftover packets, put the pipe into the heap for next ready event */
    if ((m = pipe->head) != NULL) {
        pkt = dn_tag_get(m);
        /* XXX should check errors on heap_insert, by draining the
         * whole pipe p and hoping in the future we are more successful
         */
        heap_insert(&extract_heap, schedule_time, pipe);
    }
}

/*
 * The following macro computes how many ticks we have to wait
 * before being able to transmit a packet.  The credit is taken from
 * either a pipe (WF2Q) or a flow_queue (per-flow queueing).
 */

/*
 * hz is 100, which gives a granularity of 10ms in the old timer.
 * The timer has been changed to fire every 1ms, so the use of
 * hz has been modified here.  All instances of hz have been left
 * in place but adjusted by a factor of 10 so that hz is functionally
 * equal to 1000.
 */
#define SET_TICKS(_m, q, p) \
    ((_m)->m_pkthdr.len*8*(hz*10) - (q)->numbytes + p->bandwidth - 1 ) / \
    p->bandwidth ;
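
/*
 * Worked example: with hz == 100 (so hz*10 == 1000 ticks per second),
 * a 1500-byte packet on a 12000 bit/s pipe with no accumulated credit
 * needs (1500*8*1000 + 12000 - 1) / 12000 == 1000 ticks, i.e. exactly
 * one second at the 1 ms tick.
 */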
/*
 * Extract pkt from queue, compute output time (could be now)
 * and put into delay line (p_queue).
 */
static void
move_pkt(struct mbuf *pkt, struct dn_flow_queue *q,
    struct dn_pipe *p, int len)
{
    struct dn_pkt_tag *dt = dn_tag_get(pkt);

    q->head = pkt->m_nextpkt;
    q->len--;
    q->len_bytes -= len;

    dt->dn_output_time = curr_time + p->delay;

    if (p->head == NULL)
        p->head = pkt;
    else
        p->tail->m_nextpkt = pkt;
    p->tail = pkt;
    p->tail->m_nextpkt = NULL;
}

/*
 * ready_event() is invoked every time the queue must enter the
 * scheduler, either because the first packet arrives, or because
 * a previously scheduled event fired.
 * On invocation, drain as many pkts as possible (could be 0) and then
 * if there are leftover packets reinsert the pkt in the scheduler.
 */
static void
ready_event(struct dn_flow_queue *q, struct mbuf **head, struct mbuf **tail)
{
    struct mbuf *pkt;
    struct dn_pipe *p = q->fs->pipe;
    int p_was_empty;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    if (p == NULL) {
        printf("dummynet: ready_event pipe is gone\n");
        return;
    }
    p_was_empty = (p->head == NULL);

    /*
     * Schedule fixed-rate queues linked to this pipe:
     * account for the bw accumulated since last scheduling, then
     * drain as many pkts as allowed by q->numbytes and move to
     * the delay line (in p) computing output time.
     * bandwidth==0 (no limit) means we can drain the whole queue;
     * setting len_scaled = 0 does the job.
     */
    q->numbytes += (curr_time - q->sched_time) * p->bandwidth;
    while ((pkt = q->head) != NULL) {
        int len = pkt->m_pkthdr.len;
        int len_scaled = p->bandwidth ? len*8*(hz*10) : 0;
        if (len_scaled > q->numbytes)
            break;
        q->numbytes -= len_scaled;
        move_pkt(pkt, q, p, len);
    }
    /*
     * If we have more packets queued, schedule next ready event
     * (can only occur when bandwidth != 0, otherwise we would have
     * flushed the whole queue in the previous loop).
     * To this purpose we record the current time and compute how many
     * ticks to go for the finish time of the packet.
     */
    if ((pkt = q->head) != NULL) { /* this implies bandwidth != 0 */
        dn_key t = SET_TICKS(pkt, q, p); /* ticks I have to wait */
        q->sched_time = curr_time;
        heap_insert(&ready_heap, curr_time + t, (void *)q);
        /* XXX should check errors on heap_insert, and drain the whole
         * queue on error hoping next time we are luckier.
         */
    } else { /* RED needs to know when the queue becomes empty */
        q->q_time = curr_time;
        q->numbytes = 0;
    }
    /*
     * If the delay line was empty call transmit_event(p) now.
     * Otherwise, the scheduler will take care of it.
     */
    if (p_was_empty)
        transmit_event(p, head, tail);
}
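
/*
 * Note on units: q->numbytes holds transmission credit scaled so that
 * one bit/s of configured bandwidth earns one unit per 1 ms tick,
 * while a packet of len bytes costs len*8*(hz*10) units; with
 * hz*10 == 1000 the two scales match, which is what SET_TICKS and the
 * len_scaled computation above rely on.
 */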
/*
 * Called when we can transmit packets on WF2Q queues.  Take pkts out of
 * the queues at their start time, and enqueue into the delay line.
 * Packets are drained until p->numbytes < 0.  As long as
 * len_scaled >= p->numbytes, the packet goes into the delay line
 * with a deadline p->delay.  For the last packet, if p->numbytes < 0,
 * there is an additional delay.
 */
static void
ready_event_wfq(struct dn_pipe *p, struct mbuf **head, struct mbuf **tail)
{
    int p_was_empty = (p->head == NULL);
    struct dn_heap *sch = &(p->scheduler_heap);
    struct dn_heap *neh = &(p->not_eligible_heap);
    int64_t p_numbytes = p->numbytes;

    lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

    if (p->if_name[0] == 0)     /* tx clock is simulated */
        p_numbytes += (curr_time - p->sched_time) * p->bandwidth;
    else {  /* tx clock is for real, the ifq must be empty or this is a NOP */
        if (p->ifp && !IFCQ_IS_EMPTY(&p->ifp->if_snd))
            return;
        else {
            DPRINTF(("dummynet: pipe %d ready from %s --\n",
                p->pipe_nr, p->if_name));
        }
    }

    /*
     * While we have backlogged traffic AND credit, we need to do
     * something on the queue.
     */
    while (p_numbytes >= 0 && (sch->elements > 0 || neh->elements > 0)) {
        if (sch->elements > 0) { /* have some eligible pkts to send out */
            struct dn_flow_queue *q = sch->p[0].object;
            struct mbuf *pkt = q->head;
            struct dn_flow_set *fs = q->fs;
            u_int64_t len = pkt->m_pkthdr.len;
            int len_scaled = p->bandwidth ? len*8*(hz*10) : 0;

            heap_extract(sch, NULL); /* remove queue from heap */
            p_numbytes -= len_scaled;
            move_pkt(pkt, q, p, len);

            p->V += (len << MY_M) / p->sum;     /* update V */
            q->S = q->F;                        /* update start time */
            if (q->len == 0) {  /* flow not backlogged any more */
                fs->backlogged--;
                heap_insert(&(p->idle_heap), q->F, q);
            } else {            /* still backlogged */
                /*
                 * Update F and position in backlogged queue, then
                 * put flow in not_eligible_heap (we will fix this later).
                 */
                len = (q->head)->m_pkthdr.len;
                q->F += (len << MY_M) / (u_int64_t)fs->weight;
                if (DN_KEY_LEQ(q->S, p->V))
                    heap_insert(neh, q->S, q);
                else
                    heap_insert(sch, q->F, q);
            }
        }
        /*
         * Now compute V = max(V, min(S_i)).  Remember that all elements in
         * sch have by definition S_i <= V so if sch is not empty, V is surely
         * the max and we must not update it.  Conversely, if sch is empty
         * we only need to look at neh.
         */
        if (sch->elements == 0 && neh->elements > 0)
            p->V = MAX64(p->V, neh->p[0].key);
        /* move from neh to sch any packets that have become eligible */
        while (neh->elements > 0 && DN_KEY_LEQ(neh->p[0].key, p->V)) {
            struct dn_flow_queue *q = neh->p[0].object;
            heap_extract(neh, NULL);
            heap_insert(sch, q->F, q);
        }

        if (p->if_name[0] != '\0') { /* tx clock is from a real thing */
            p_numbytes = -1;         /* mark not ready for I/O */
            break;
        }
    }
    if (sch->elements == 0 && neh->elements == 0 && p_numbytes >= 0
        && p->idle_heap.elements > 0) {
        /*
         * No traffic and no events scheduled.  We can get rid of idle-heap.
         */
        int i;

        for (i = 0; i < p->idle_heap.elements; i++) {
            struct dn_flow_queue *q = p->idle_heap.p[i].object;

            q->F = 0;
            q->S = q->F + 1;
        }
        p->sum = 0;
        p->V = 0;
        p->idle_heap.elements = 0;
    }
    /*
     * If we are getting clocks from dummynet (not a real interface) and
     * we are under credit, schedule the next ready event.
     * Also fix the delivery time of the last packet.
     */
    if (p->if_name[0] == 0 && p_numbytes < 0) { /* this implies bandwidth > 0 */
        dn_key t = 0;   /* number of ticks I have to wait */

        if (p->bandwidth > 0)
            t = (p->bandwidth - 1 - p_numbytes) / p->bandwidth;
        dn_tag_get(p->tail)->dn_output_time += t;
        p->sched_time = curr_time;
        heap_insert(&wfq_ready_heap, curr_time + t, (void *)p);
        /* XXX should check errors on heap_insert, and drain the whole
         * queue on error hoping next time we are luckier.
         */
    }

    /* Fit (adjust if necessary) 64bit result into 32bit variable. */
    if (p_numbytes > INT_MAX)
        p->numbytes = INT_MAX;
    else if (p_numbytes < INT_MIN)
        p->numbytes = INT_MIN;
    else
        p->numbytes = p_numbytes;

    /*
     * If the delay line was empty call transmit_event(p) now.
     * Otherwise, the scheduler will take care of it.
     */
    if (p_was_empty)
        transmit_event(p, head, tail);
}
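
/*
 * The virtual clock above advances by (len << MY_M) / p->sum per
 * transmitted packet, i.e. bytes sent divided by the aggregate weight
 * of backlogged flows, while a flow's finish time grows by
 * (len << MY_M) / weight.  Two backlogged flows with weights 2 and 1
 * therefore see their F advance at rates 1:2 and share bandwidth 2:1,
 * which is the WF2Q+ service model.
 */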
/*
 * This is called every 1ms.  It is used to
 * increment the current tick counter and schedule expired events.
 */
static void
dummynet(__unused void *unused)
{
    void *p;    /* generic parameter to handler */
    struct dn_heap *h;
    struct dn_heap *heaps[3];
    struct mbuf *head = NULL, *tail = NULL;
    int i;
    struct dn_pipe *pe;
    struct timespec ts;
    struct timeval tv;

    heaps[0] = &ready_heap;     /* fixed-rate queues */
    heaps[1] = &wfq_ready_heap; /* wfq queues */
    heaps[2] = &extract_heap;   /* delay line */

    lck_mtx_lock(dn_mutex);

    /*
     * Make all time measurements in milliseconds (ms):
     * here we convert secs and usecs to msecs (just divide the
     * usecs and take the closest whole number).
     */
    microuptime(&tv);
    curr_time = (tv.tv_sec * 1000) + (tv.tv_usec / 1000);

    for (i = 0; i < 3; i++) {
        h = heaps[i];
        while (h->elements > 0 && DN_KEY_LEQ(h->p[0].key, curr_time)) {
            if (h->p[0].key > curr_time)
                printf("dummynet: warning, heap %d is %d ticks late\n",
                    i, (int)(curr_time - h->p[0].key));
            p = h->p[0].object;     /* store a copy before heap_extract */
            heap_extract(h, NULL);  /* need to extract before processing */
            if (i == 0)
                ready_event(p, &head, &tail);
            else if (i == 1) {
                struct dn_pipe *pipe = p;
                if (pipe->if_name[0] != '\0')
                    printf("dummynet: bad ready_event_wfq for pipe %s\n",
                        pipe->if_name);
                else
                    ready_event_wfq(p, &head, &tail);
            } else {
                transmit_event(p, &head, &tail);
            }
        }
    }
    /* sweep pipes trying to expire idle flow_queues */
    for (i = 0; i < HASHSIZE; i++)
        SLIST_FOREACH(pe, &pipehash[i], next)
            if (pe->idle_heap.elements > 0 &&
                DN_KEY_LT(pe->idle_heap.p[0].key, pe->V)) {
                struct dn_flow_queue *q = pe->idle_heap.p[0].object;

                heap_extract(&(pe->idle_heap), NULL);
                q->S = q->F + 1;    /* mark timestamp as invalid */
                pe->sum -= q->fs->weight;
            }

    /*
     * Check the heaps to see if there's still stuff in there, and
     * only set the timer if there are packets to process.
     */
    timer_enabled = 0;
    for (i = 0; i < 3; i++) {
        h = heaps[i];
        if (h->elements > 0) {          /* set the timer */
            ts.tv_sec = 0;
            ts.tv_nsec = 1 * 1000000;   /* 1ms */
            timer_enabled = 1;
            bsd_timeout(dummynet, NULL, &ts);
            break;
        }
    }

    if (head != NULL)
        serialize++;

    lck_mtx_unlock(dn_mutex);

    /* Send out the de-queued list of ready-to-send packets */
    if (head != NULL) {
        dummynet_send(head);
        lck_mtx_lock(dn_mutex);
        serialize--;
        lck_mtx_unlock(dn_mutex);
    }
}
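
/*
 * The timer is deliberately one-shot: the loop above re-arms it only
 * while some heap still holds a pending event, and dummynet_io()
 * restarts it when a packet arrives while it is idle (see the
 * timer_enabled check there), so an idle system pays no periodic cost.
 */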
static void
dummynet_send(struct mbuf *m)
{
    struct dn_pkt_tag *pkt;
    struct mbuf *n;

    for (; m != NULL; m = n) {
        n = m->m_nextpkt;
        m->m_nextpkt = NULL;
        pkt = dn_tag_get(m);

        DPRINTF(("dummynet_send m: 0x%llx dn_dir: %d dn_flags: 0x%x\n",
            (uint64_t)VM_KERNEL_ADDRPERM(m), pkt->dn_dir,
            pkt->dn_flags));

        switch (pkt->dn_dir) {
        case DN_TO_IP_OUT: {
            struct route tmp_rt;

            /* route is already in the packet's dn_ro */
            bzero(&tmp_rt, sizeof(tmp_rt));

            /* Force IP_RAWOUTPUT as the IP header is fully formed */
            pkt->dn_flags |= IP_RAWOUTPUT | IP_FORWARDING;
            (void) ip_output(m, NULL, &tmp_rt, pkt->dn_flags, NULL, NULL);
            ROUTE_RELEASE(&tmp_rt);
            break;
        }
        case DN_TO_IP_IN:
            proto_inject(PF_INET, m);
            break;
#ifdef INET6
        case DN_TO_IP6_OUT: {
            /* routes are already in the packet's dn_{ro6,pmtu} */
            ip6_output(m, NULL, NULL, IPV6_FORWARDING, NULL, NULL, NULL);
            break;
        }
        case DN_TO_IP6_IN:
            proto_inject(PF_INET6, m);
            break;
#endif /* INET6 */
        default:
            printf("dummynet: bad switch %d!\n", pkt->dn_dir);
            m_freem(m);
            break;
        }
    }
}

/*
 * Called by an interface when tx_rdy occurs.
 */
int
if_tx_rdy(struct ifnet *ifp)
{
    struct dn_pipe *p = NULL;
    struct mbuf *head = NULL, *tail = NULL;
    int i;

    lck_mtx_lock(dn_mutex);

    /*
     * Scan the buckets; SLIST_FOREACH leaves p == NULL when a bucket is
     * exhausted, so check p before moving on, otherwise a match found
     * in an earlier bucket would be lost when the outer loop continues.
     */
    for (i = 0; i < HASHSIZE; i++) {
        SLIST_FOREACH(p, &pipehash[i], next)
            if (p->ifp == ifp)
                break;
        if (p != NULL)
            break;
    }
    if (p == NULL) {
        char buf[32];
        snprintf(buf, sizeof(buf), "%s", if_name(ifp));
        for (i = 0; i < HASHSIZE; i++) {
            SLIST_FOREACH(p, &pipehash[i], next)
                if (!strcmp(p->if_name, buf)) {
                    p->ifp = ifp;
                    DPRINTF(("dummynet: ++ tx rdy from %s (now found)\n",
                        buf));
                    break;
                }
            if (p != NULL)
                break;
        }
    }
    if (p != NULL) {
        DPRINTF(("dummynet: ++ tx rdy from %s - qlen %d\n", if_name(ifp),
            IFCQ_LEN(&ifp->if_snd)));
        p->numbytes = 0;    /* mark ready for I/O */
        ready_event_wfq(p, &head, &tail);
    }

    if (head != NULL) {
        serialize++;
    }

    lck_mtx_unlock(dn_mutex);

    /* Send out the de-queued list of ready-to-send packets */
    if (head != NULL) {
        dummynet_send(head);
        lck_mtx_lock(dn_mutex);
        serialize--;
        lck_mtx_unlock(dn_mutex);
    }
    return 0;
}

/*
 * Unconditionally expire empty queues in case of shortage.
 * Returns the number of queues freed.
 */
static int
expire_queues(struct dn_flow_set *fs)
{
    struct dn_flow_queue *q, *prev;
    int i, initial_elements = fs->rq_elements;
    struct timeval timenow;

    /* reviewed for getmicrotime usage */
    getmicrotime(&timenow);

    if (fs->last_expired == timenow.tv_sec)
        return 0;
    fs->last_expired = timenow.tv_sec;
    for (i = 0; i <= fs->rq_size; i++)  /* last one is overflow */
        for (prev = NULL, q = fs->rq[i]; q != NULL; )
            if (q->head != NULL || q->S != q->F+1) {
                prev = q;
                q = q->next;
            } else {    /* entry is idle, expire it */
                struct dn_flow_queue *old_q = q;

                if (prev != NULL)
                    prev->next = q = q->next;
                else
                    fs->rq[i] = q = q->next;
                fs->rq_elements--;
                FREE(old_q, M_DUMMYNET);
            }
    return initial_elements - fs->rq_elements;
}

/*
 * If room, create a new queue and put at head of slot i;
 * otherwise, create or use the default queue.
 */
static struct dn_flow_queue *
create_queue(struct dn_flow_set *fs, int i)
{
    struct dn_flow_queue *q;

    if (fs->rq_elements > fs->rq_size * dn_max_ratio &&
        expire_queues(fs) == 0) {
        /*
         * No way to get room, use or create overflow queue.
         */
        i = fs->rq_size;
        if (fs->rq[i] != NULL)
            return fs->rq[i];
    }
    q = _MALLOC(sizeof(*q), M_DUMMYNET, M_DONTWAIT | M_ZERO);
    if (q == NULL) {
        printf("dummynet: sorry, cannot allocate queue for new flow\n");
        return NULL;
    }
    q->fs = fs;
    q->hash_slot = i;
    q->next = fs->rq[i];
    q->S = q->F + 1;    /* hack - mark timestamp as invalid */
    fs->rq[i] = q;
    fs->rq_elements++;
    return q;
}

/*
 * Given a flow_set and the flow id of a packet, find a matching queue
 * after appropriate masking.  The queue is moved to the front
 * so that further searches take less time.
 */
static struct dn_flow_queue *
find_queue(struct dn_flow_set *fs, struct ip_flow_id *id)
{
    int i = 0;  /* we need i and q for new allocations */
    struct dn_flow_queue *q, *prev;
    int is_v6 = IS_IP6_FLOW_ID(id);

    if (!(fs->flags_fs & DN_HAVE_FLOW_MASK))
        q = fs->rq[0];
    else {
        /* first, do the masking, then hash */
        id->dst_port &= fs->flow_mask.dst_port;
        id->src_port &= fs->flow_mask.src_port;
        id->proto &= fs->flow_mask.proto;
        id->flags = 0;  /* we don't care about this one */
        if (is_v6) {
            APPLY_MASK(&id->dst_ip6, &fs->flow_mask.dst_ip6);
            APPLY_MASK(&id->src_ip6, &fs->flow_mask.src_ip6);
            id->flow_id6 &= fs->flow_mask.flow_id6;

            i = ((id->dst_ip6.__u6_addr.__u6_addr32[0]) & 0xffff) ^
                ((id->dst_ip6.__u6_addr.__u6_addr32[1]) & 0xffff) ^
                ((id->dst_ip6.__u6_addr.__u6_addr32[2]) & 0xffff) ^
                ((id->dst_ip6.__u6_addr.__u6_addr32[3]) & 0xffff) ^

                ((id->dst_ip6.__u6_addr.__u6_addr32[0] >> 15) & 0xffff) ^
                ((id->dst_ip6.__u6_addr.__u6_addr32[1] >> 15) & 0xffff) ^
                ((id->dst_ip6.__u6_addr.__u6_addr32[2] >> 15) & 0xffff) ^
                ((id->dst_ip6.__u6_addr.__u6_addr32[3] >> 15) & 0xffff) ^

                ((id->src_ip6.__u6_addr.__u6_addr32[0] << 1) & 0xfffff) ^
                ((id->src_ip6.__u6_addr.__u6_addr32[1] << 1) & 0xfffff) ^
                ((id->src_ip6.__u6_addr.__u6_addr32[2] << 1) & 0xfffff) ^
                ((id->src_ip6.__u6_addr.__u6_addr32[3] << 1) & 0xfffff) ^

                ((id->src_ip6.__u6_addr.__u6_addr32[0] << 16) & 0xffff) ^
                ((id->src_ip6.__u6_addr.__u6_addr32[1] << 16) & 0xffff) ^
                ((id->src_ip6.__u6_addr.__u6_addr32[2] << 16) & 0xffff) ^
                ((id->src_ip6.__u6_addr.__u6_addr32[3] << 16) & 0xffff) ^

                (id->dst_port << 1) ^ (id->src_port) ^
                (id->proto) ^
                (id->flow_id6);
        } else {
            id->dst_ip &= fs->flow_mask.dst_ip;
            id->src_ip &= fs->flow_mask.src_ip;

            i = ((id->dst_ip) & 0xffff) ^
                ((id->dst_ip >> 15) & 0xffff) ^
                ((id->src_ip << 1) & 0xffff) ^
                ((id->src_ip >> 16) & 0xffff) ^
                (id->dst_port << 1) ^ (id->src_port) ^
                (id->proto);
        }
        i = i % fs->rq_size;
        /* finally, scan the current list for a match */
        searches++;
        for (prev = NULL, q = fs->rq[i]; q; ) {
            search_steps++;
            if (is_v6 &&
                IN6_ARE_ADDR_EQUAL(&id->dst_ip6, &q->id.dst_ip6) &&
                IN6_ARE_ADDR_EQUAL(&id->src_ip6, &q->id.src_ip6) &&
                id->dst_port == q->id.dst_port &&
                id->src_port == q->id.src_port &&
                id->proto == q->id.proto &&
                id->flags == q->id.flags &&
                id->flow_id6 == q->id.flow_id6)
                break;  /* found */

            if (!is_v6 && id->dst_ip == q->id.dst_ip &&
                id->src_ip == q->id.src_ip &&
                id->dst_port == q->id.dst_port &&
                id->src_port == q->id.src_port &&
                id->proto == q->id.proto &&
                id->flags == q->id.flags)
                break;  /* found */

            /* No match.  Check if we can expire the entry. */
            if (pipe_expire && q->head == NULL && q->S == q->F+1) {
                /* entry is idle and not in any heap, expire it */
                struct dn_flow_queue *old_q = q;

                if (prev != NULL)
                    prev->next = q = q->next;
                else
                    fs->rq[i] = q = q->next;
                fs->rq_elements--;
                FREE(old_q, M_DUMMYNET);
                continue;
            }
            prev = q;
            q = q->next;
        }
        if (q && prev != NULL) { /* found and not in front */
            prev->next = q->next;
            q->next = fs->rq[i];
            fs->rq[i] = q;
        }
    }
    if (q == NULL) { /* no match, need to allocate a new entry */
        q = create_queue(fs, i);
        if (q != NULL)
            q->id = *id;
    }
    return q;
}
static int
red_drops(struct dn_flow_set *fs, struct dn_flow_queue *q, int len)
{
    /*
     * RED algorithm
     *
     * RED calculates the average queue size (avg) using a low-pass filter
     * with an exponential weighted (w_q) moving average:
     *     avg <- (1-w_q) * avg + w_q * q_size
     * where q_size is the queue length (measured in bytes or packets).
     *
     * If q_size == 0, we compute the idle time for the link, and set
     *     avg <- avg * (1 - w_q)^(idle/s)
     * where s is the time needed for transmitting a medium-sized packet.
     *
     * Now, if avg < min_th the packet is enqueued.
     * If avg > max_th the packet is dropped.  Otherwise, the packet is
     * dropped with probability P, a function of avg.
     */
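
    /*
     * Worked example with hypothetical parameters (ignoring the SCALE()
     * fixed-point factors used below): min_th = 5 pkts, max_th = 15 pkts,
     * max_p = 0.1.  Then c_1 = max_p/(max_th - min_th) = 0.01 and
     * c_2 = max_p*min_th/(max_th - min_th) = 0.05, so at avg = 10 the
     * base drop probability is p_b = 0.01*10 - 0.05 = 0.05.
     */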
    int64_t p_b = 0;
    /* queue in bytes or packets? */
    u_int q_size = (fs->flags_fs & DN_QSIZE_IS_BYTES) ? q->len_bytes : q->len;

    DPRINTF(("\ndummynet: %d q: %2u ", (int)curr_time, q_size));

    /* average queue size estimation */
    if (q_size != 0) {
        /*
         * queue is not empty, avg <- avg + (q_size - avg) * w_q
         */
        int diff = SCALE(q_size) - q->avg;
        int64_t v = SCALE_MUL((int64_t)diff, (int64_t)fs->w_q);

        q->avg += (int)v;
    } else {
        /*
         * Queue is empty: find for how long the queue has been
         * empty and use a lookup table for computing
         * (1 - w_q)^(idle_time/s) where s is the time to send a
         * (small) packet.
         * XXX check wraps...
         */
        if (q->avg) {
            u_int t = (curr_time - q->q_time) / fs->lookup_step;

            q->avg = (t < fs->lookup_depth) ?
                SCALE_MUL(q->avg, fs->w_q_lookup[t]) : 0;
        }
    }
    DPRINTF(("dummynet: avg: %u ", SCALE_VAL(q->avg)));

    /* should I drop? */

    if (q->avg < fs->min_th) {
        q->count = -1;
        return 0;   /* accept packet */
    }
    if (q->avg >= fs->max_th) { /* average queue >= max threshold */
        if (fs->flags_fs & DN_IS_GENTLE_RED) {
            /*
             * According to Gentle-RED, if avg is greater than max_th the
             * packet is dropped with a probability
             *     p_b = c_3 * avg - c_4
             * where c_3 = (1 - max_p) / max_th, and c_4 = 1 - 2 * max_p.
             */
            p_b = SCALE_MUL((int64_t)fs->c_3, (int64_t)q->avg) - fs->c_4;
        } else {
            q->count = -1;
            DPRINTF(("dummynet: - drop"));
            return 1;
        }
    } else if (q->avg > fs->min_th) {
        /*
         * We compute p_b using the linear dropping function
         *     p_b = c_1 * avg - c_2,
         * where c_1 = max_p / (max_th - min_th), and
         * c_2 = max_p * min_th / (max_th - min_th).
         */
        p_b = SCALE_MUL((int64_t)fs->c_1, (int64_t)q->avg) - fs->c_2;
    }
    if (fs->flags_fs & DN_QSIZE_IS_BYTES)
        p_b = (p_b * len) / fs->max_pkt_size;
    if (++q->count == 0)
        q->random = MY_RANDOM & 0xffff;
    else {
        /*
         * q->count counts packets arrived since last drop, so a greater
         * value of q->count means a greater packet drop probability.
         */
        if (SCALE_MUL(p_b, SCALE((int64_t)q->count)) > q->random) {
            q->count = 0;
            DPRINTF(("dummynet: - red drop"));
            /* after a drop we calculate a new random value */
            q->random = MY_RANDOM & 0xffff;
            return 1;   /* drop */
        }
    }
    /* end of RED algorithm */
    return 0;   /* accept */
}

static __inline struct dn_flow_set *
locate_flowset(int fs_nr)
{
    struct dn_flow_set *fs;

    SLIST_FOREACH(fs, &flowsethash[HASH(fs_nr)], next)
        if (fs->fs_nr == fs_nr)
            return fs;

    return (NULL);
}

static __inline struct dn_pipe *
locate_pipe(int pipe_nr)
{
    struct dn_pipe *pipe;

    SLIST_FOREACH(pipe, &pipehash[HASH(pipe_nr)], next)
        if (pipe->pipe_nr == pipe_nr)
            return (pipe);

    return (NULL);
}
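
/*
 * Both lookups above share the 16-bucket hash defined earlier: for
 * example, HASH(0x123) == ((0x1 ^ 0x12 ^ 0x123) & 0x0f) == 0, so pipe
 * number 0x123 lives on the chain rooted at pipehash[0].
 */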
/*
 * Dummynet hook for packets.  Below, 'pipe' is a pipe or a queue
 * depending on whether WF2Q or fixed bw is used.
 *
 * pipe_nr	pipe or queue the packet is destined for.
 * dir		where shall we send the packet after dummynet.
 * m		the mbuf with the packet
 * ifp		the 'ifp' parameter from the caller.
 *		NULL in ip_input, destination interface in ip_output,
 *		real_dst in bdg_forward
 * ro		route parameter (only used in ip_output, NULL otherwise)
 * dst		destination address, only used by ip_output
 * rule		matching rule, in case of multiple passes
 * flags	flags from the caller, only used in ip_output
 */
static int
dummynet_io(struct mbuf *m, int pipe_nr, int dir, struct ip_fw_args *fwa,
    int client)
{
    struct mbuf *head = NULL, *tail = NULL;
    struct dn_pkt_tag *pkt;
    struct m_tag *mtag;
    struct dn_flow_set *fs = NULL;
    struct dn_pipe *pipe;
    u_int64_t len = m->m_pkthdr.len;
    struct dn_flow_queue *q = NULL;
    int is_pipe = 0;
    struct timespec ts;
    struct timeval tv;

    DPRINTF(("dummynet_io m: 0x%llx pipe: %d dir: %d client: %d\n",
        (uint64_t)VM_KERNEL_ADDRPERM(m), pipe_nr, dir, client));

#if IPFIREWALL
#if IPFW2
    if (client == DN_CLIENT_IPFW) {
        ipfw_insn *cmd = fwa->fwa_ipfw_rule->cmd + fwa->fwa_ipfw_rule->act_ofs;

        if (cmd->opcode == O_LOG)
            cmd += F_LEN(cmd);
        is_pipe = (cmd->opcode == O_PIPE);
    }
#else
    if (client == DN_CLIENT_IPFW)
        is_pipe = (fwa->fwa_ipfw_rule->fw_flg & IP_FW_F_COMMAND) == IP_FW_F_PIPE;
#endif
#endif /* IPFIREWALL */

#if DUMMYNET
    if (client == DN_CLIENT_PF)
        is_pipe = fwa->fwa_flags == DN_IS_PIPE ? 1 : 0;
#endif /* DUMMYNET */

    pipe_nr &= 0xffff;

    lck_mtx_lock(dn_mutex);

    /*
     * Make all time measurements in milliseconds (ms):
     * here we convert secs and usecs to msecs (just divide the
     * usecs and take the closest whole number).
     */
    microuptime(&tv);
    curr_time = (tv.tv_sec * 1000) + (tv.tv_usec / 1000);

    /*
     * This is a dummynet rule, so we expect an O_PIPE or O_QUEUE rule.
     */
    if (is_pipe) {
        pipe = locate_pipe(pipe_nr);
        if (pipe != NULL)
            fs = &(pipe->fs);
    } else
        fs = locate_flowset(pipe_nr);

    if (fs == NULL) {
        goto dropit;    /* this queue/pipe does not exist! */
    }
    pipe = fs->pipe;
    if (pipe == NULL) { /* must be a queue, try to find a matching pipe */
        pipe = locate_pipe(fs->parent_nr);

        if (pipe != NULL)
            fs->pipe = pipe;
        else {
            printf("dummynet: no pipe %d for queue %d, drop pkt\n",
                fs->parent_nr, fs->fs_nr);
            goto dropit;
        }
    }
    q = find_queue(fs, &(fwa->fwa_id));
    if (q == NULL)
        goto dropit;    /* cannot allocate queue */
    /*
     * Update statistics, then check reasons to drop pkt.
     */
    q->tot_bytes += len;
    q->tot_pkts++;
    if (fs->plr && (MY_RANDOM < fs->plr))
        goto dropit;    /* random pkt drop */
    if (fs->flags_fs & DN_QSIZE_IS_BYTES) {
        if (q->len_bytes > fs->qsize)
            goto dropit;    /* queue size overflow */
    } else {
        if (q->len >= fs->qsize)
            goto dropit;    /* queue count overflow */
    }
    if (fs->flags_fs & DN_IS_RED && red_drops(fs, q, len))
        goto dropit;

    /* XXX expensive to zero, see if we can remove it */
    mtag = m_tag_create(KERNEL_MODULE_TAG_ID, KERNEL_TAG_TYPE_DUMMYNET,
        sizeof(struct dn_pkt_tag), M_NOWAIT, m);
    if (mtag == NULL)
        goto dropit;    /* cannot allocate packet header */
    m_tag_prepend(m, mtag); /* attach to mbuf chain */

    pkt = (struct dn_pkt_tag *)(mtag+1);
    bzero(pkt, sizeof(struct dn_pkt_tag));
    /* ok, we can handle the pkt now... */
    /* build and enqueue packet + parameters */
    /*
     * PF is checked before ipfw, so remember the ipfw rule only when
     * the caller is ipfw.  When the caller is PF, fwa_ipfw_rule
     * is a fake rule just used for convenience.
     */
    if (client == DN_CLIENT_IPFW)
        pkt->dn_ipfw_rule = fwa->fwa_ipfw_rule;
    pkt->dn_pf_rule = fwa->fwa_pf_rule;
    pkt->dn_dir = dir;
    pkt->dn_client = client;

    pkt->dn_ifp = fwa->fwa_oif;
    if (dir == DN_TO_IP_OUT) {
        /*
         * We need to copy *ro because for ICMP pkts (and maybe others)
         * the caller passed a pointer into the stack; dst might also be
         * a pointer into *ro so it needs to be updated.
         */
        if (fwa->fwa_ro) {
            route_copyout(&pkt->dn_ro, fwa->fwa_ro, sizeof(pkt->dn_ro));
        }
        if (fwa->fwa_dst) {
            /* dst points into ro */
            if (fwa->fwa_dst == (struct sockaddr_in *)&fwa->fwa_ro->ro_dst)
                fwa->fwa_dst = (struct sockaddr_in *)&(pkt->dn_ro.ro_dst);

            bcopy(fwa->fwa_dst, &pkt->dn_dst, sizeof(pkt->dn_dst));
        }
    } else if (dir == DN_TO_IP6_OUT) {
        if (fwa->fwa_ro6) {
            route_copyout((struct route *)&pkt->dn_ro6,
                (struct route *)fwa->fwa_ro6, sizeof(pkt->dn_ro6));
        }
        if (fwa->fwa_ro6_pmtu) {
            route_copyout((struct route *)&pkt->dn_ro6_pmtu,
                (struct route *)fwa->fwa_ro6_pmtu, sizeof(pkt->dn_ro6_pmtu));
        }
        if (fwa->fwa_dst6) {
            /* dst points into ro */
            if (fwa->fwa_dst6 == (struct sockaddr_in6 *)&fwa->fwa_ro6->ro_dst)
                fwa->fwa_dst6 = (struct sockaddr_in6 *)&(pkt->dn_ro6.ro_dst);

            bcopy(fwa->fwa_dst6, &pkt->dn_dst6, sizeof(pkt->dn_dst6));
        }
        pkt->dn_origifp = fwa->fwa_origifp;
        pkt->dn_mtu = fwa->fwa_mtu;
        pkt->dn_alwaysfrag = fwa->fwa_alwaysfrag;
        pkt->dn_unfragpartlen = fwa->fwa_unfragpartlen;
        if (fwa->fwa_exthdrs) {
            bcopy(fwa->fwa_exthdrs, &pkt->dn_exthdrs, sizeof(pkt->dn_exthdrs));
            /*
             * Need to zero out the source structure so the mbufs
             * won't be freed by ip6_output().
             */
            bzero(fwa->fwa_exthdrs, sizeof(struct ip6_exthdrs));
        }
    }
    if (dir == DN_TO_IP_OUT || dir == DN_TO_IP6_OUT) {
        pkt->dn_flags = fwa->fwa_oflags;
        if (fwa->fwa_ipoa != NULL)
            pkt->dn_ipoa = *(fwa->fwa_ipoa);
    }
    if (q->head == NULL)
        q->head = m;
    else
        q->tail->m_nextpkt = m;
    q->tail = m;
    q->len++;
    q->len_bytes += len;

    if (q->head != m)   /* flow was not idle, we are done */
        goto done;
    /*
     * If we reach this point the flow was previously idle, so we need
     * to schedule it.  This involves different actions for fixed-rate or
     * WF2Q queues.
     */
    if (is_pipe) {
        /*
         * Fixed-rate queue: just insert into the ready_heap.
         */
        dn_key t = 0;
        if (pipe->bandwidth)
            t = SET_TICKS(m, q, pipe);
        q->sched_time = curr_time;
        if (t == 0) /* must process it now */
            ready_event(q, &head, &tail);
        else
            heap_insert(&ready_heap, curr_time + t, q);
    } else {
        /*
         * WF2Q.  First, compute start time S: if the flow was idle (S=F+1)
         * set S to the virtual time V for the controlling pipe, and update
         * the sum of weights for the pipe; otherwise, remove flow from
         * idle_heap and set S to max(F,V).
         * Second, compute finish time F = S + len/weight.
         * Third, if pipe was idle, update V = max(S, V).
         * Fourth, count one more backlogged flow.
         */
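
        /*
         * Concretely: if two flows go backlogged at the same V with
         * equal packet sizes, the one with twice the weight gets half
         * the F increment per packet and is therefore scheduled twice
         * as often; S > V marks a flow that has already been served
         * ahead of the virtual clock and must wait in
         * not_eligible_heap.
         */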
		if (DN_KEY_GT(q->S, q->F)) { /* means timestamps are invalid */
			q->S = pipe->V;
			pipe->sum += fs->weight;	/* add weight of new queue */
		} else {
			heap_extract(&(pipe->idle_heap), q);
			q->S = MAX64(q->F, pipe->V);
		}
		q->F = q->S + (len << MY_M) / (u_int64_t)fs->weight;

		if (pipe->not_eligible_heap.elements == 0 &&
		    pipe->scheduler_heap.elements == 0)
			pipe->V = MAX64(q->S, pipe->V);
		fs->backlogged++;
		/*
		 * Look at eligibility. A flow is not eligible if S>V (when
		 * this happens, it means that there is some other flow already
		 * scheduled for the same pipe, so the scheduler_heap cannot be
		 * empty). If the flow is not eligible we just store it in the
		 * not_eligible_heap. Otherwise, we store it in the scheduler_heap
		 * and possibly invoke ready_event_wfq() right now if there is
		 * leftover credit.
		 * Note that for all flows in scheduler_heap (SCH), S_i <= V,
		 * and for all flows in not_eligible_heap (NEH), S_i > V.
		 * So when we need to compute max(V, min(S_i)) for all i in SCH+NEH,
		 * we only need to look into NEH.
		 */
		if (DN_KEY_GT(q->S, pipe->V)) { /* not eligible */
			if (pipe->scheduler_heap.elements == 0)
				printf("dummynet: ++ ouch! not eligible but empty scheduler!\n");
			heap_insert(&(pipe->not_eligible_heap), q->S, q);
		} else {
			heap_insert(&(pipe->scheduler_heap), q->F, q);
			if (pipe->numbytes >= 0) { /* pipe is idle */
				if (pipe->scheduler_heap.elements != 1)
					printf("dummynet: OUCH! pipe should have been idle!\n");
				DPRINTF(("dummynet: waking up pipe %d at %d\n",
				    pipe->pipe_nr, (int)(q->F >> MY_M)));
				pipe->sched_time = curr_time;
				ready_event_wfq(pipe, &head, &tail);
			}
		}
	}
done:
	/* start the timer and set the global flag if not already set */
	if (!timer_enabled) {
		ts.tv_sec = 0;
		ts.tv_nsec = 1 * 1000000;	/* 1 ms */
		timer_enabled = 1;
		bsd_timeout(dummynet, NULL, &ts);
	}

	lck_mtx_unlock(dn_mutex);

	if (head != NULL) {
		dummynet_send(head);
	}

	return 0;

dropit:
	if (q)
		q->drops++;
	lck_mtx_unlock(dn_mutex);
	m_freem(m);
	return ((fs && (fs->flags_fs & DN_NOERROR)) ? 0 : ENOBUFS);
}

/*
 * In principle, the ROUTE_RELEASE below is only needed when
 * pkt->dn_dir == DN_TO_IP_OUT; checking for that would probably let us
 * skip the initial bzero of the dn_pkt_tag as well.
 */
#define DN_FREE_PKT(_m) do {					\
	struct m_tag *tag = m_tag_locate(_m, KERNEL_MODULE_TAG_ID, \
	    KERNEL_TAG_TYPE_DUMMYNET, NULL);			\
	if (tag) {						\
		struct dn_pkt_tag *n = (struct dn_pkt_tag *)(tag+1); \
		ROUTE_RELEASE(&n->dn_ro);			\
		m_tag_delete(_m, tag);				\
	}							\
	m_freem(_m);						\
} while (0)
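/*
 * Typical use of DN_FREE_PKT, as in purge_flow_set()/purge_pipe() below:
 * walk an m_nextpkt chain saving the successor first, since the macro
 * frees the mbuf it is given:
 *
 *	mnext = q->head;
 *	while ((m = mnext) != NULL) {
 *		mnext = m->m_nextpkt;
 *		DN_FREE_PKT(m);
 *	}
 */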
/*
 * Dispose of all packets and flow_queues on a flow_set.
 * If all=1, also remove the RED lookup table and other storage,
 * including the descriptor itself.
 * For the flow_set embedded in a dn_pipe, the caller MUST also
 * clean up ready_heap...
 */
static void
purge_flow_set(struct dn_flow_set *fs, int all)
{
	struct dn_flow_queue *q, *qn;
	int i;

	lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

	for (i = 0; i <= fs->rq_size; i++) {
		for (q = fs->rq[i]; q; q = qn) {
			struct mbuf *m, *mnext;

			mnext = q->head;
			while ((m = mnext) != NULL) {
				mnext = m->m_nextpkt;
				DN_FREE_PKT(m);
			}
			qn = q->next;
			FREE(q, M_DUMMYNET);
		}
		fs->rq[i] = NULL;
	}
	fs->rq_elements = 0;
	if (all) {
		/* RED - free lookup table */
		if (fs->w_q_lookup)
			FREE(fs->w_q_lookup, M_DUMMYNET);
		if (fs->rq)
			FREE(fs->rq, M_DUMMYNET);
		/* if this fs is not part of a pipe, free it */
		if (fs->pipe && fs != &(fs->pipe->fs))
			FREE(fs, M_DUMMYNET);
	}
}

/*
 * Dispose of all packets queued on a pipe (not a flow_set).
 * Also free all resources associated with a pipe, which is about
 * to be deleted.
 */
static void
purge_pipe(struct dn_pipe *pipe)
{
	struct mbuf *m, *mnext;

	purge_flow_set(&(pipe->fs), 1);

	mnext = pipe->head;
	while ((m = mnext) != NULL) {
		mnext = m->m_nextpkt;
		DN_FREE_PKT(m);
	}

	heap_free(&(pipe->scheduler_heap));
	heap_free(&(pipe->not_eligible_heap));
	heap_free(&(pipe->idle_heap));
}

/*
 * Delete all pipes and heaps, returning memory. Must also
 * remove references from all ipfw rules to all pipes.
 */
static void
dummynet_flush(void)
{
	struct dn_pipe *pipe, *pipe1;
	struct dn_flow_set *fs, *fs1;
	int i;

	lck_mtx_lock(dn_mutex);

#if IPFW2
	/* remove all references to pipes ... */
	flush_pipe_ptrs(NULL);
#endif /* IPFW2 */

	/* Free heaps so we don't have unwanted events. */
	heap_free(&ready_heap);
	heap_free(&wfq_ready_heap);
	heap_free(&extract_heap);

	/*
	 * Now purge all queued pkts and delete all pipes.
	 *
	 * XXXGL: can we merge the for(;;) cycles into one or not?
	 */
	for (i = 0; i < HASHSIZE; i++)
		SLIST_FOREACH_SAFE(fs, &flowsethash[i], next, fs1) {
			SLIST_REMOVE(&flowsethash[i], fs, dn_flow_set, next);
			purge_flow_set(fs, 1);
		}
	for (i = 0; i < HASHSIZE; i++)
		SLIST_FOREACH_SAFE(pipe, &pipehash[i], next, pipe1) {
			SLIST_REMOVE(&pipehash[i], pipe, dn_pipe, next);
			purge_pipe(pipe);
			FREE(pipe, M_DUMMYNET);
		}
	lck_mtx_unlock(dn_mutex);
}

static void
dn_ipfw_rule_delete_fs(struct dn_flow_set *fs, void *r)
{
	int i;
	struct dn_flow_queue *q;
	struct mbuf *m;

	for (i = 0; i <= fs->rq_size; i++)	/* last one is the overflow queue */
		for (q = fs->rq[i]; q; q = q->next)
			for (m = q->head; m; m = m->m_nextpkt) {
				struct dn_pkt_tag *pkt = dn_tag_get(m);
				if (pkt->dn_ipfw_rule == r)
					pkt->dn_ipfw_rule = &default_rule;
			}
}
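/*
 * Note: queued packets keep a raw pointer to the ipfw rule that steered
 * them here (pkt->dn_ipfw_rule), used when the packet is reinjected for
 * another pass over the ruleset. Redirecting that pointer to
 * &default_rule, as done above and in the pipe scan below, is what keeps
 * a later reinjection from dereferencing a rule that has been freed.
 */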
/*
 * When a firewall rule is deleted, scan all queues and replace the rule
 * pointer in any packet still referencing it with a pointer to the
 * default rule.
 */
void
dn_ipfw_rule_delete(void *r)
{
	struct dn_pipe *p;
	struct dn_flow_set *fs;
	struct dn_pkt_tag *pkt;
	struct mbuf *m;
	int i;

	lck_mtx_lock(dn_mutex);

	/*
	 * If the rule references a queue (dn_flow_set), then scan
	 * the flow set, otherwise scan pipes. It should suffice to do
	 * either, but doing both does no harm.
	 */
	for (i = 0; i < HASHSIZE; i++)
		SLIST_FOREACH(fs, &flowsethash[i], next)
			dn_ipfw_rule_delete_fs(fs, r);

	for (i = 0; i < HASHSIZE; i++)
		SLIST_FOREACH(p, &pipehash[i], next) {
			fs = &(p->fs);
			dn_ipfw_rule_delete_fs(fs, r);
			for (m = p->head; m; m = m->m_nextpkt) {
				pkt = dn_tag_get(m);
				if (pkt->dn_ipfw_rule == r)
					pkt->dn_ipfw_rule = &default_rule;
			}
		}
	lck_mtx_unlock(dn_mutex);
}

/*
 * setup RED parameters
 */
static int
config_red(struct dn_flow_set *p, struct dn_flow_set *x)
{
	int i;

	x->w_q = p->w_q;
	x->min_th = SCALE(p->min_th);
	x->max_th = SCALE(p->max_th);
	x->max_p = p->max_p;

	x->c_1 = p->max_p / (p->max_th - p->min_th);
	x->c_2 = SCALE_MUL(x->c_1, SCALE(p->min_th));
	if (x->flags_fs & DN_IS_GENTLE_RED) {
		x->c_3 = (SCALE(1) - p->max_p) / p->max_th;
		x->c_4 = (SCALE(1) - 2 * p->max_p);
	}

	/* if the lookup table already exists, free it and create it again */
	if (x->w_q_lookup) {
		FREE(x->w_q_lookup, M_DUMMYNET);
		x->w_q_lookup = NULL;
	}
	if (red_lookup_depth == 0) {
		printf("\ndummynet: net.inet.ip.dummynet.red_lookup_depth must be > 0\n");
		FREE(x, M_DUMMYNET);
		return EINVAL;
	}
	x->lookup_depth = red_lookup_depth;
	x->w_q_lookup = (u_int *)_MALLOC(x->lookup_depth * sizeof(int),
	    M_DUMMYNET, M_DONTWAIT);
	if (x->w_q_lookup == NULL) {
		printf("dummynet: sorry, cannot allocate red lookup table\n");
		FREE(x, M_DUMMYNET);
		return ENOSPC;
	}

	/* fill the lookup table with (1 - w_q)^x */
	x->lookup_step = p->lookup_step;
	x->lookup_weight = p->lookup_weight;
	x->w_q_lookup[0] = SCALE(1) - x->w_q;
	for (i = 1; i < x->lookup_depth; i++)
		x->w_q_lookup[i] = SCALE_MUL(x->w_q_lookup[i - 1], x->lookup_weight);
	if (red_avg_pkt_size < 1)
		red_avg_pkt_size = 512;
	x->avg_pkt_size = red_avg_pkt_size;
	if (red_max_pkt_size < 1)
		red_max_pkt_size = 1500;
	x->max_pkt_size = red_max_pkt_size;
	return 0;
}

static int
alloc_hash(struct dn_flow_set *x, struct dn_flow_set *pfs)
{
	if (x->flags_fs & DN_HAVE_FLOW_MASK) { /* allocate some slots */
		int l = pfs->rq_size;

		if (l == 0)
			l = dn_hash_size;
		if (l < 4)
			l = 4;
		else if (l > DN_MAX_HASH_SIZE)
			l = DN_MAX_HASH_SIZE;
		x->rq_size = l;
	} else	/* one is enough for null mask */
		x->rq_size = 1;
	x->rq = _MALLOC((1 + x->rq_size) * sizeof(struct dn_flow_queue *),
	    M_DUMMYNET, M_DONTWAIT | M_ZERO);
	if (x->rq == NULL) {
		printf("dummynet: sorry, cannot allocate queue\n");
		return ENOSPC;
	}
	x->rq_elements = 0;
	return 0;
}

static void
set_fs_parms(struct dn_flow_set *x, struct dn_flow_set *src)
{
	x->flags_fs = src->flags_fs;
	x->qsize = src->qsize;
	x->plr = src->plr;
	x->flow_mask = src->flow_mask;
	if (x->flags_fs & DN_QSIZE_IS_BYTES) {
		if (x->qsize > 1024*1024)
			x->qsize = 1024*1024;
	} else {
		if (x->qsize == 0)
			x->qsize = 50;
		/* out-of-range slot counts fall back to the default of 50 */
		if (x->qsize > 100)
			x->qsize = 50;
	}
	/* configuring RED */
	if (x->flags_fs & DN_IS_RED)
		config_red(src, x);	/* XXX should check errors; config_red() frees x on failure */
}
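/*
 * Worked example for the RED table built above (a sketch; it assumes the
 * config program precomputes lookup_weight as (1 - w_q)^lookup_step, as
 * the classic dummynet userland does). With w_q = 0.002 in SCALE() fixed
 * point:
 *
 *	w_q_lookup[0] = SCALE(1) - w_q          ~ 0.998
 *	w_q_lookup[i] = SCALE_MUL(w_q_lookup[i-1], lookup_weight)
 *	              ~ (1 - w_q)^(1 + i * lookup_step)
 *
 * so the decayed average queue length after an idle period of
 * i * lookup_step ticks comes down to a single table lookup.
 */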
/*
 * setup pipe or queue parameters
 */

static int
config_pipe(struct dn_pipe *p)
{
	int i, r;
	struct dn_flow_set *pfs = &(p->fs);
	struct dn_flow_queue *q;

	/*
	 * The config program passes parameters as follows:
	 * bw = bits/second (0 means no limits),
	 * delay = ms, must be translated into ticks.
	 * qsize = slots/bytes
	 */
	p->delay = (p->delay * (hz*10)) / 1000;	/* with hz = 100: one tick per ms */
	/* We need either a pipe number or a flow_set number */
	if (p->pipe_nr == 0 && pfs->fs_nr == 0)
		return EINVAL;
	if (p->pipe_nr != 0 && pfs->fs_nr != 0)
		return EINVAL;
	if (p->pipe_nr != 0) {	/* this is a pipe */
		struct dn_pipe *x, *b;

		lck_mtx_lock(dn_mutex);

		/* locate pipe */
		b = locate_pipe(p->pipe_nr);

		if (b == NULL || b->pipe_nr != p->pipe_nr) { /* new pipe */
			x = _MALLOC(sizeof(struct dn_pipe), M_DUMMYNET, M_DONTWAIT | M_ZERO);
			if (x == NULL) {
				lck_mtx_unlock(dn_mutex);
				printf("dummynet: no memory for new pipe\n");
				return ENOSPC;
			}
			x->pipe_nr = p->pipe_nr;
			x->fs.pipe = x;
			/*
			 * idle_heap is the only heap we ever extract from the
			 * middle of, so it needs an offset for heap_extract().
			 */
			x->idle_heap.size = x->idle_heap.elements = 0;
			x->idle_heap.offset = offsetof(struct dn_flow_queue, heap_pos);
		} else {
			x = b;
			/* Flush accumulated credit for all queues */
			for (i = 0; i <= x->fs.rq_size; i++)
				for (q = x->fs.rq[i]; q; q = q->next)
					q->numbytes = 0;
		}

		x->bandwidth = p->bandwidth;
		x->numbytes = 0;	/* just in case... */
		bcopy(p->if_name, x->if_name, sizeof(p->if_name));
		x->ifp = NULL;		/* reset interface ptr */
		x->delay = p->delay;
		set_fs_parms(&(x->fs), pfs);

		if (x->fs.rq == NULL) {	/* a new pipe */
			r = alloc_hash(&(x->fs), pfs);
			if (r) {
				lck_mtx_unlock(dn_mutex);
				FREE(x, M_DUMMYNET);
				return r;
			}
			SLIST_INSERT_HEAD(&pipehash[HASH(x->pipe_nr)],
			    x, next);
		}
		lck_mtx_unlock(dn_mutex);
	} else {	/* config queue */
		struct dn_flow_set *x, *b;

		lck_mtx_lock(dn_mutex);
		/* locate flow_set */
		b = locate_flowset(pfs->fs_nr);

		if (b == NULL || b->fs_nr != pfs->fs_nr) { /* new */
			if (pfs->parent_nr == 0) { /* need link to a pipe */
				lck_mtx_unlock(dn_mutex);
				return EINVAL;
			}
			x = _MALLOC(sizeof(struct dn_flow_set), M_DUMMYNET, M_DONTWAIT | M_ZERO);
			if (x == NULL) {
				lck_mtx_unlock(dn_mutex);
				printf("dummynet: no memory for new flow_set\n");
				return ENOSPC;
			}
			x->fs_nr = pfs->fs_nr;
			x->parent_nr = pfs->parent_nr;
			x->weight = pfs->weight;
			if (x->weight == 0)
				x->weight = 1;
			else if (x->weight > 100)
				x->weight = 100;
		} else {
			/* Changing the parent pipe is not allowed; delete and recreate */
			if (pfs->parent_nr != 0 && b->parent_nr != pfs->parent_nr) {
				lck_mtx_unlock(dn_mutex);
				return EINVAL;
			}
			x = b;
		}
		set_fs_parms(x, pfs);

		if (x->rq == NULL) {	/* a new flow_set */
			r = alloc_hash(x, pfs);
			if (r) {
				lck_mtx_unlock(dn_mutex);
				FREE(x, M_DUMMYNET);
				return r;
			}
			SLIST_INSERT_HEAD(&flowsethash[HASH(x->fs_nr)],
			    x, next);
		}
		lck_mtx_unlock(dn_mutex);
	}
	return 0;
}
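/*
 * Illustrative userland sketch (not kernel code; the field values are
 * examples only): this is how a tool in the spirit of ipfw(8) would drive
 * config_pipe() through the socket-option interface handled by
 * ip_dn_ctl() below:
 *
 *	struct dn_pipe pipe;
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *
 *	bzero(&pipe, sizeof(pipe));
 *	pipe.pipe_nr = 1;
 *	pipe.bandwidth = 500000;	// 500 Kbit/s
 *	pipe.delay = 30;		// ms; converted to ticks by config_pipe()
 *	setsockopt(s, IPPROTO_IP, IP_DUMMYNET_CONFIGURE, &pipe, sizeof(pipe));
 */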
/*
 * Helper function to remove from a heap queues which are linked to
 * a flow_set about to be deleted.
 */
static void
fs_remove_from_heap(struct dn_heap *h, struct dn_flow_set *fs)
{
	int i = 0, found = 0;
	for (; i < h->elements;)
		if (((struct dn_flow_queue *)h->p[i].object)->fs == fs) {
			h->elements--;
			h->p[i] = h->p[h->elements];
			found++;
		} else
			i++;
	if (found)
		heapify(h);
}

/*
 * helper function to remove a pipe from a heap (can be there at most once)
 */
static void
pipe_remove_from_heap(struct dn_heap *h, struct dn_pipe *p)
{
	if (h->elements > 0) {
		int i;
		for (i = 0; i < h->elements; i++) {
			if (h->p[i].object == p) { /* found it */
				h->elements--;
				h->p[i] = h->p[h->elements];
				heapify(h);
				break;
			}
		}
	}
}
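/*
 * Both helpers above delete by swapping the victim with the last heap
 * element and re-running heapify() once at the end, rather than doing an
 * ordered extraction per element: e.g. removing B from [A B C D E] first
 * yields [A E C D], and the single heapify() restores heap order. This
 * keeps deletion of k elements at one O(n) fixup instead of k of them.
 */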
/*
 * drain all queues. Called in case of severe mbuf shortage.
 */
void
dummynet_drain(void)
{
	struct dn_flow_set *fs;
	struct dn_pipe *p;
	struct mbuf *m, *mnext;
	int i;

	lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

	heap_free(&ready_heap);
	heap_free(&wfq_ready_heap);
	heap_free(&extract_heap);
	/* remove all references to the pipes from flow_sets */
	for (i = 0; i < HASHSIZE; i++)
		SLIST_FOREACH(fs, &flowsethash[i], next)
			purge_flow_set(fs, 0);

	for (i = 0; i < HASHSIZE; i++)
		SLIST_FOREACH(p, &pipehash[i], next) {
			purge_flow_set(&(p->fs), 0);

			mnext = p->head;
			while ((m = mnext) != NULL) {
				mnext = m->m_nextpkt;
				DN_FREE_PKT(m);
			}
			p->head = p->tail = NULL;
		}
}

/*
 * Fully delete a pipe or a queue, cleaning up associated info.
 */
static int
delete_pipe(struct dn_pipe *p)
{
	if (p->pipe_nr == 0 && p->fs.fs_nr == 0)
		return EINVAL;
	if (p->pipe_nr != 0 && p->fs.fs_nr != 0)
		return EINVAL;
	if (p->pipe_nr != 0) {	/* this is an old-style pipe */
		struct dn_pipe *b;
		struct dn_flow_set *fs;
		int i;

		lck_mtx_lock(dn_mutex);
		/* locate pipe */
		b = locate_pipe(p->pipe_nr);
		if (b == NULL) {
			lck_mtx_unlock(dn_mutex);
			return EINVAL;	/* not found */
		}

		/* Unlink from list of pipes. */
		SLIST_REMOVE(&pipehash[HASH(b->pipe_nr)], b, dn_pipe, next);

#if IPFW2
		/* remove references to this pipe from the ip_fw rules. */
		flush_pipe_ptrs(&(b->fs));
#endif /* IPFW2 */

		/* Remove all references to this pipe from flow_sets. */
		for (i = 0; i < HASHSIZE; i++)
			SLIST_FOREACH(fs, &flowsethash[i], next)
				if (fs->pipe == b) {
					printf("dummynet: ++ ref to pipe %d from fs %d\n",
					    p->pipe_nr, fs->fs_nr);
					fs->pipe = NULL;
					purge_flow_set(fs, 0);
				}
		fs_remove_from_heap(&ready_heap, &(b->fs));

		purge_pipe(b);	/* remove all data associated with this pipe */
		/* remove references to this pipe from extract_heap and wfq_ready_heap */
		pipe_remove_from_heap(&extract_heap, b);
		pipe_remove_from_heap(&wfq_ready_heap, b);
		lck_mtx_unlock(dn_mutex);

		FREE(b, M_DUMMYNET);
	} else {	/* this is a WF2Q queue (dn_flow_set) */
		struct dn_flow_set *b;

		lck_mtx_lock(dn_mutex);
		/* locate set */
		b = locate_flowset(p->fs.fs_nr);
		if (b == NULL) {
			lck_mtx_unlock(dn_mutex);
			return EINVAL;	/* not found */
		}

#if IPFW2
		/* remove references to this flow_set from the ip_fw rules. */
		flush_pipe_ptrs(b);
#endif /* IPFW2 */

		/* Unlink from list of flowsets. */
		SLIST_REMOVE(&flowsethash[HASH(b->fs_nr)], b, dn_flow_set, next);

		if (b->pipe != NULL) {
			/* Update total weight on parent pipe and cleanup parent heaps */
			b->pipe->sum -= b->weight * b->backlogged;
			fs_remove_from_heap(&(b->pipe->not_eligible_heap), b);
			fs_remove_from_heap(&(b->pipe->scheduler_heap), b);
#if 1	/* XXX should i remove from idle_heap as well ? */
			fs_remove_from_heap(&(b->pipe->idle_heap), b);
#endif
		}
		purge_flow_set(b, 1);
		lck_mtx_unlock(dn_mutex);
	}
	return 0;
}

/*
 * helper functions used to copy data from the kernel in DUMMYNET_GET
 */
static char *
dn_copy_set_32(struct dn_flow_set *set, char *bp)
{
	int i, copied = 0;
	struct dn_flow_queue *q;
	struct dn_flow_queue_32 *qp = (struct dn_flow_queue_32 *)bp;

	lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

	for (i = 0; i <= set->rq_size; i++)
		for (q = set->rq[i]; q; q = q->next, qp++) {
			if (q->hash_slot != i)
				printf("dummynet: ++ at %d: wrong slot (have %d, "
				    "should be %d)\n", copied, q->hash_slot, i);
			if (q->fs != set)
				printf("dummynet: ++ at %d: wrong fs ptr "
				    "(have 0x%llx, should be 0x%llx)\n", i,
				    (uint64_t)VM_KERNEL_ADDRPERM(q->fs),
				    (uint64_t)VM_KERNEL_ADDRPERM(set));
			copied++;
			cp_queue_to_32_user(q, qp);
			/* cleanup pointers */
			qp->next = (user32_addr_t)0;
			qp->head = qp->tail = (user32_addr_t)0;
			qp->fs = (user32_addr_t)0;
		}
	if (copied != set->rq_elements)
		printf("dummynet: ++ wrong count, have %d should be %d\n",
		    copied, set->rq_elements);
	return (char *)qp;
}

static char *
dn_copy_set_64(struct dn_flow_set *set, char *bp)
{
	int i, copied = 0;
	struct dn_flow_queue *q;
	struct dn_flow_queue_64 *qp = (struct dn_flow_queue_64 *)bp;

	lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);

	for (i = 0; i <= set->rq_size; i++)
		for (q = set->rq[i]; q; q = q->next, qp++) {
			if (q->hash_slot != i)
				printf("dummynet: ++ at %d: wrong slot (have %d, "
				    "should be %d)\n", copied, q->hash_slot, i);
			if (q->fs != set)
				printf("dummynet: ++ at %d: wrong fs ptr "
				    "(have 0x%llx, should be 0x%llx)\n", i,
				    (uint64_t)VM_KERNEL_ADDRPERM(q->fs),
				    (uint64_t)VM_KERNEL_ADDRPERM(set));
			copied++;
			cp_queue_to_64_user(q, qp);
			/* cleanup pointers */
			qp->next = USER_ADDR_NULL;
			qp->head = qp->tail = USER_ADDR_NULL;
			qp->fs = USER_ADDR_NULL;
		}
	if (copied != set->rq_elements)
		printf("dummynet: ++ wrong count, have %d should be %d\n",
		    copied, set->rq_elements);
	return (char *)qp;
}

static size_t
dn_calc_size(int is64user)
{
	struct dn_flow_set *set;
	struct dn_pipe *p;
	size_t size = 0;
	size_t pipesize;
	size_t queuesize;
	size_t setsize;
	int i;

	lck_mtx_assert(dn_mutex, LCK_MTX_ASSERT_OWNED);
	if (is64user) {
		pipesize = sizeof(struct dn_pipe_64);
		queuesize = sizeof(struct dn_flow_queue_64);
		setsize = sizeof(struct dn_flow_set_64);
	} else {
		pipesize = sizeof(struct dn_pipe_32);
		queuesize = sizeof(struct dn_flow_queue_32);
		setsize = sizeof(struct dn_flow_set_32);
	}
	/*
	 * compute size of data structures: list of pipes and flow_sets.
	 */
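	/*
	 * Per record emitted by dummynet_get(): a pipe costs
	 *	pipesize + rq_elements * queuesize
	 * and a standalone flow_set costs
	 *	setsize + rq_elements * queuesize
	 * (descriptor followed by one queue record per active flow).
	 */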
	for (i = 0; i < HASHSIZE; i++) {
		SLIST_FOREACH(p, &pipehash[i], next)
			size += pipesize +
			    p->fs.rq_elements * queuesize;
		SLIST_FOREACH(set, &flowsethash[i], next)
			size += setsize +
			    set->rq_elements * queuesize;
	}
	return size;
}

static int
dummynet_get(struct sockopt *sopt)
{
	char *buf, *bp = NULL;	/* bp is the "copy-pointer" */
	size_t size;
	struct dn_flow_set *set;
	struct dn_pipe *p;
	int error = 0, i;
	int is64user = 0;

	/* XXX lock held too long */
	lck_mtx_lock(dn_mutex);
	/*
	 * XXX: Ugly, but we need to allocate memory with the M_WAITOK flag
	 * and we cannot use it while holding a mutex.
	 */
	if (proc_is64bit(sopt->sopt_p))
		is64user = 1;
	/* retry a few times: the set of pipes can change while unlocked */
	for (i = 0; i < 10; i++) {
		size = dn_calc_size(is64user);
		lck_mtx_unlock(dn_mutex);
		buf = _MALLOC(size, M_TEMP, M_WAITOK);
		if (buf == NULL)
			return ENOBUFS;
		lck_mtx_lock(dn_mutex);
		if (size == dn_calc_size(is64user))
			break;
		FREE(buf, M_TEMP);
		buf = NULL;
	}
	if (buf == NULL) {
		lck_mtx_unlock(dn_mutex);
		return ENOBUFS;
	}

	bp = buf;
	for (i = 0; i < HASHSIZE; i++)
		SLIST_FOREACH(p, &pipehash[i], next) {
			/*
			 * copy the pipe descriptor into *bp, convert delay
			 * back to ms, then copy the flow_set descriptor(s)
			 * one at a time. After each flow_set, copy the
			 * queue descriptors it owns.
			 */
			if (is64user) {
				bp = cp_pipe_to_64_user(p, (struct dn_pipe_64 *)bp);
			} else {
				bp = cp_pipe_to_32_user(p, (struct dn_pipe_32 *)bp);
			}
		}
	for (i = 0; i < HASHSIZE; i++)
		SLIST_FOREACH(set, &flowsethash[i], next) {
			/*
			 * Standalone flow_sets must be emitted in the
			 * caller's layout as well, mirroring the pipe loop
			 * above via the cp_*_32/cp_*_64 helpers.
			 */
			if (is64user) {
				struct dn_flow_set_64 *fs_bp =
				    (struct dn_flow_set_64 *)bp;
				cp_flow_set_to_64_user(set, fs_bp);
				/* XXX same hack as above */
				fs_bp->next = CAST_DOWN(user64_addr_t, DN_IS_QUEUE);
				fs_bp->pipe = USER_ADDR_NULL;
				fs_bp->rq = USER_ADDR_NULL;
				bp += sizeof(struct dn_flow_set_64);
				bp = dn_copy_set_64(set, bp);
			} else {
				struct dn_flow_set_32 *fs_bp =
				    (struct dn_flow_set_32 *)bp;
				cp_flow_set_to_32_user(set, fs_bp);
				/* XXX same hack as above */
				fs_bp->next = CAST_DOWN_EXPLICIT(user32_addr_t, DN_IS_QUEUE);
				fs_bp->pipe = 0;
				fs_bp->rq = 0;
				bp += sizeof(struct dn_flow_set_32);
				bp = dn_copy_set_32(set, bp);
			}
		}
	lck_mtx_unlock(dn_mutex);

	error = sooptcopyout(sopt, buf, size);
	FREE(buf, M_TEMP);
	return error;
}
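/*
 * Illustrative userland sketch (not kernel code; the fixed-size buffer is
 * an assumption about how a client might cope with the racy size): this
 * is how the records emitted above would be read back, in the style of
 * an "ipfw pipe show" tool:
 *
 *	char buf[65536];	// hypothetical buffer
 *	socklen_t len = sizeof(buf);
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
 *
 *	if (getsockopt(s, IPPROTO_IP, IP_DUMMYNET_GET, buf, &len) == 0) {
 *		// buf now holds the pipe records (each followed by its
 *		// queues) and then the flow_set records, whose ->next
 *		// field carries the DN_IS_QUEUE marker set above.
 *	}
 */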
/*
 * Handler for the various dummynet socket options (get, flush, config, del)
 */
static int
ip_dn_ctl(struct sockopt *sopt)
{
	int error = 0;
	struct dn_pipe *p, tmp_pipe;

	/* Disallow sets in really-really secure mode. */
	if (sopt->sopt_dir == SOPT_SET && securelevel >= 3)
		return (EPERM);

	switch (sopt->sopt_name) {
	default:
		printf("dummynet: -- unknown option %d\n", sopt->sopt_name);
		return EINVAL;

	case IP_DUMMYNET_GET:
		error = dummynet_get(sopt);
		break;

	case IP_DUMMYNET_FLUSH:
		dummynet_flush();
		break;

	case IP_DUMMYNET_CONFIGURE:
		p = &tmp_pipe;
		if (proc_is64bit(sopt->sopt_p))
			error = cp_pipe_from_user_64(sopt, p);
		else
			error = cp_pipe_from_user_32(sopt, p);

		if (error)
			break;
		error = config_pipe(p);
		break;

	case IP_DUMMYNET_DEL:	/* remove a pipe or queue */
		p = &tmp_pipe;
		if (proc_is64bit(sopt->sopt_p))
			error = cp_pipe_from_user_64(sopt, p);
		else
			error = cp_pipe_from_user_32(sopt, p);
		if (error)
			break;

		error = delete_pipe(p);
		break;
	}
	return error;
}

void
ip_dn_init(void)
{
	/* setup locks */
	dn_mutex_grp_attr = lck_grp_attr_alloc_init();
	dn_mutex_grp = lck_grp_alloc_init("dn", dn_mutex_grp_attr);
	dn_mutex_attr = lck_attr_alloc_init();
	lck_mtx_init(dn_mutex, dn_mutex_grp, dn_mutex_attr);

	ready_heap.size = ready_heap.elements = 0;
	ready_heap.offset = 0;

	wfq_ready_heap.size = wfq_ready_heap.elements = 0;
	wfq_ready_heap.offset = 0;

	extract_heap.size = extract_heap.elements = 0;
	extract_heap.offset = 0;

	ip_dn_ctl_ptr = ip_dn_ctl;
	ip_dn_io_ptr = dummynet_io;

	bzero(&default_rule, sizeof default_rule);

	default_rule.act_ofs = 0;
	default_rule.rulenum = IPFW_DEFAULT_RULE;
	default_rule.cmd_len = 1;
	default_rule.set = RESVD_SET;

	default_rule.cmd[0].len = 1;
	default_rule.cmd[0].opcode =
#ifdef IPFIREWALL_DEFAULT_TO_ACCEPT
	    (1) ? O_ACCEPT :
#endif
	    O_DENY;
}
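/*
 * Call-site sketch (illustrative; the actual callers live elsewhere in
 * the stack): once ip_dn_init() has planted the hooks above, the IP layer
 * reaches dummynet only indirectly through the function pointers, e.g.
 *
 *	if (ip_dn_io_ptr != NULL)
 *		error = ip_dn_io_ptr(m, pipe_nr, DN_TO_IP_OUT, &args,
 *		    DN_CLIENT_IPFW);
 *
 * which keeps the core stack free of compile-time dependencies on this
 * module.
 */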