dummynet.c revision 205050
1/* 2 * Copyright (c) 2002-2003,2010 Luigi Rizzo 3 * 4 * Redistribution and use in source forms, with and without modification, 5 * are permitted provided that this entire comment appears intact. 6 * 7 * Redistribution in binary form may occur without any restrictions. 8 * Obviously, it would be nice if you gave credit where credit is due 9 * but requiring it would be too onerous. 10 * 11 * This software is provided ``AS IS'' without any warranties of any kind. 12 * 13 * $FreeBSD: head/sbin/ipfw/dummynet.c 205050 2010-03-11 22:42:33Z luigi $ 14 * 15 * dummynet support 16 */ 17 18#include <sys/types.h> 19#include <sys/socket.h> 20/* XXX there are several sysctl leftover here */ 21#include <sys/sysctl.h> 22 23#include "ipfw2.h" 24 25#include <ctype.h> 26#include <err.h> 27#include <errno.h> 28#include <libutil.h> 29#include <netdb.h> 30#include <stdio.h> 31#include <stdlib.h> 32#include <string.h> 33#include <sysexits.h> 34 35#include <net/if.h> 36#include <netinet/in.h> 37#include <netinet/ip_fw.h> 38#include <netinet/ip_dummynet.h> 39#include <arpa/inet.h> /* inet_ntoa */ 40 41 42static struct _s_x dummynet_params[] = { 43 { "plr", TOK_PLR }, 44 { "noerror", TOK_NOERROR }, 45 { "buckets", TOK_BUCKETS }, 46 { "dst-ip", TOK_DSTIP }, 47 { "src-ip", TOK_SRCIP }, 48 { "dst-port", TOK_DSTPORT }, 49 { "src-port", TOK_SRCPORT }, 50 { "proto", TOK_PROTO }, 51 { "weight", TOK_WEIGHT }, 52 { "lmax", TOK_LMAX }, 53 { "maxlen", TOK_LMAX }, 54 { "all", TOK_ALL }, 55 { "mask", TOK_MASK }, /* alias for both */ 56 { "sched_mask", TOK_SCHED_MASK }, 57 { "flow_mask", TOK_FLOW_MASK }, 58 { "droptail", TOK_DROPTAIL }, 59 { "red", TOK_RED }, 60 { "gred", TOK_GRED }, 61 { "bw", TOK_BW }, 62 { "bandwidth", TOK_BW }, 63 { "delay", TOK_DELAY }, 64 { "link", TOK_LINK }, 65 { "pipe", TOK_PIPE }, 66 { "queue", TOK_QUEUE }, 67 { "flowset", TOK_FLOWSET }, 68 { "sched", TOK_SCHED }, 69 { "pri", TOK_PRI }, 70 { "priority", TOK_PRI }, 71 { "type", TOK_TYPE }, 72 { "flow-id", TOK_FLOWID}, 73 { 
"dst-ipv6", TOK_DSTIP6}, 74 { "dst-ip6", TOK_DSTIP6}, 75 { "src-ipv6", TOK_SRCIP6}, 76 { "src-ip6", TOK_SRCIP6}, 77 { "profile", TOK_PROFILE}, 78 { "burst", TOK_BURST}, 79 { "dummynet-params", TOK_NULL }, 80 { NULL, 0 } /* terminator */ 81}; 82 83#define O_NEXT(p, len) ((void *)((char *)p + len)) 84 85static void 86oid_fill(struct dn_id *oid, int len, int type, uintptr_t id) 87{ 88 oid->len = len; 89 oid->type = type; 90 oid->subtype = 0; 91 oid->id = id; 92} 93 94/* make room in the buffer and move the pointer forward */ 95static void * 96o_next(struct dn_id **o, int len, int type) 97{ 98 struct dn_id *ret = *o; 99 oid_fill(ret, len, type, 0); 100 *o = O_NEXT(*o, len); 101 return ret; 102} 103 104#if 0 105static int 106sort_q(void *arg, const void *pa, const void *pb) 107{ 108 int rev = (co.do_sort < 0); 109 int field = rev ? -co.do_sort : co.do_sort; 110 long long res = 0; 111 const struct dn_flow_queue *a = pa; 112 const struct dn_flow_queue *b = pb; 113 114 switch (field) { 115 case 1: /* pkts */ 116 res = a->len - b->len; 117 break; 118 case 2: /* bytes */ 119 res = a->len_bytes - b->len_bytes; 120 break; 121 122 case 3: /* tot pkts */ 123 res = a->tot_pkts - b->tot_pkts; 124 break; 125 126 case 4: /* tot bytes */ 127 res = a->tot_bytes - b->tot_bytes; 128 break; 129 } 130 if (res < 0) 131 res = -1; 132 if (res > 0) 133 res = 1; 134 return (int)(rev ? res : -res); 135} 136#endif 137 138/* print a mask and header for the subsequent list of flows */ 139static void 140print_mask(struct ipfw_flow_id *id) 141{ 142 if (!IS_IP6_FLOW_ID(id)) { 143 printf(" " 144 "mask: 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n", 145 id->proto, 146 id->src_ip, id->src_port, 147 id->dst_ip, id->dst_port); 148 149 printf("BKT Prot ___Source IP/port____ " 150 "____Dest. 
IP/port____ " 151 "Tot_pkt/bytes Pkt/Byte Drp\n"); 152 } else { 153 char buf[255]; 154 printf("\n mask: proto: 0x%02x, flow_id: 0x%08x, ", 155 id->proto, id->flow_id6); 156 inet_ntop(AF_INET6, &(id->src_ip6), buf, sizeof(buf)); 157 printf("%s/0x%04x -> ", buf, id->src_port); 158 inet_ntop(AF_INET6, &(id->dst_ip6), buf, sizeof(buf)); 159 printf("%s/0x%04x\n", buf, id->dst_port); 160 161 printf("BKT ___Prot___ _flow-id_ " 162 "______________Source IPv6/port_______________ " 163 "_______________Dest. IPv6/port_______________ " 164 "Tot_pkt/bytes Pkt/Byte Drp\n"); 165 } 166} 167 168static void 169list_flow(struct dn_flow *ni) 170{ 171 char buff[255]; 172 struct protoent *pe; 173 struct in_addr ina; 174 struct ipfw_flow_id *id = &ni->fid; 175 176 pe = getprotobynumber(id->proto); 177 /* XXX: Should check for IPv4 flows */ 178 printf("%3u ", (ni->oid.id) & 0xff); 179 if (!IS_IP6_FLOW_ID(id)) { 180 if (pe) 181 printf("%-4s ", pe->p_name); 182 else 183 printf("%4u ", id->proto); 184 ina.s_addr = htonl(id->src_ip); 185 printf("%15s/%-5d ", 186 inet_ntoa(ina), id->src_port); 187 ina.s_addr = htonl(id->dst_ip); 188 printf("%15s/%-5d ", 189 inet_ntoa(ina), id->dst_port); 190 } else { 191 /* Print IPv6 flows */ 192 if (pe != NULL) 193 printf("%9s ", pe->p_name); 194 else 195 printf("%9u ", id->proto); 196 printf("%7d %39s/%-5d ", id->flow_id6, 197 inet_ntop(AF_INET6, &(id->src_ip6), buff, sizeof(buff)), 198 id->src_port); 199 printf(" %39s/%-5d ", 200 inet_ntop(AF_INET6, &(id->dst_ip6), buff, sizeof(buff)), 201 id->dst_port); 202 } 203 printf("%4llu %8llu %2u %4u %3u\n", 204 align_uint64(&ni->tot_pkts), 205 align_uint64(&ni->tot_bytes), 206 ni->length, ni->len_bytes, ni->drops); 207} 208 209static void 210print_flowset_parms(struct dn_fs *fs, char *prefix) 211{ 212 int l; 213 char qs[30]; 214 char plr[30]; 215 char red[90]; /* Display RED parameters */ 216 217 l = fs->qsize; 218 if (fs->flags & DN_QSIZE_BYTES) { 219 if (l >= 8192) 220 sprintf(qs, "%d KB", l / 1024); 221 else 
222 sprintf(qs, "%d B", l); 223 } else 224 sprintf(qs, "%3d sl.", l); 225 if (fs->plr) 226 sprintf(plr, "plr %f", 1.0 * fs->plr / (double)(0x7fffffff)); 227 else 228 plr[0] = '\0'; 229 230 if (fs->flags & DN_IS_RED) /* RED parameters */ 231 sprintf(red, 232 "\n\t %cRED w_q %f min_th %d max_th %d max_p %f", 233 (fs->flags & DN_IS_GENTLE_RED) ? 'G' : ' ', 234 1.0 * fs->w_q / (double)(1 << SCALE_RED), 235 fs->min_th, 236 fs->max_th, 237 1.0 * fs->max_p / (double)(1 << SCALE_RED)); 238 else 239 sprintf(red, "droptail"); 240 241 if (prefix[0]) { 242 printf("%s %s%s %d queues (%d buckets) %s\n", 243 prefix, qs, plr, fs->oid.id, fs->buckets, red); 244 prefix[0] = '\0'; 245 } else { 246 printf("q%05d %s%s %d flows (%d buckets) sched %d " 247 "weight %d lmax %d pri %d %s\n", 248 fs->fs_nr, qs, plr, fs->oid.id, fs->buckets, 249 fs->sched_nr, fs->par[0], fs->par[1], fs->par[2], red); 250 if (fs->flags & DN_HAVE_MASK) 251 print_mask(&fs->flow_mask); 252 } 253} 254 255static void 256print_extra_delay_parms(struct dn_profile *p) 257{ 258 double loss; 259 if (p->samples_no <= 0) 260 return; 261 262 loss = p->loss_level; 263 loss /= p->samples_no; 264 printf("\t profile: name \"%s\" loss %f samples %d\n", 265 p->name, loss, p->samples_no); 266} 267 268static void 269flush_buf(char *buf) 270{ 271 if (buf[0]) 272 printf("%s\n", buf); 273 buf[0] = '\0'; 274} 275 276/* 277 * generic list routine. We expect objects in a specific order, i.e. 278 * PIPES AND SCHEDULERS: 279 * link; scheduler; internal flowset if any; instances 280 * we can tell a pipe from the number. 
281 * 282 * FLOWSETS: 283 * flowset; queues; 284 * link i (int queue); scheduler i; si(i) { flowsets() : queues } 285 */ 286static void 287list_pipes(struct dn_id *oid, struct dn_id *end) 288{ 289 char buf[160]; /* pending buffer */ 290 buf[0] = '\0'; 291 292 for (; oid != end; oid = O_NEXT(oid, oid->len)) { 293 if (oid->len < sizeof(*oid)) 294 errx(1, "invalid oid len %d\n", oid->len); 295 296 switch (oid->type) { 297 default: 298 flush_buf(buf); 299 printf("unrecognized object %d size %d\n", oid->type, oid->len); 300 break; 301 case DN_TEXT: /* list of attached flowsets */ 302 { 303 int i, l; 304 struct { 305 struct dn_id id; 306 uint32_t p[0]; 307 } *d = (void *)oid; 308 l = (oid->len - sizeof(*oid))/sizeof(d->p[0]); 309 if (l == 0) 310 break; 311 printf(" Children flowsets: "); 312 for (i = 0; i < l; i++) 313 printf("%u ", d->p[i]); 314 printf("\n"); 315 break; 316 } 317 case DN_CMD_GET: 318 if (co.verbose) 319 printf("answer for cmd %d, len %d\n", oid->type, oid->id); 320 break; 321 case DN_SCH: { 322 struct dn_sch *s = (struct dn_sch *)oid; 323 flush_buf(buf); 324 printf(" sched %d type %s flags 0x%x %d buckets %d active\n", 325 s->sched_nr, 326 s->name, s->flags, s->buckets, s->oid.id); 327 if (s->flags & DN_HAVE_MASK) 328 print_mask(&s->sched_mask); 329 } 330 break; 331 332 case DN_FLOW: 333 list_flow((struct dn_flow *)oid); 334 break; 335 336 case DN_LINK: { 337 struct dn_link *p = (struct dn_link *)oid; 338 double b = p->bandwidth; 339 char bwbuf[30]; 340 char burst[5 + 7]; 341 342 /* This starts a new object so flush buffer */ 343 flush_buf(buf); 344 /* data rate */ 345 if (b == 0) 346 sprintf(bwbuf, "unlimited "); 347 else if (b >= 1000000) 348 sprintf(bwbuf, "%7.3f Mbit/s", b/1000000); 349 else if (b >= 1000) 350 sprintf(bwbuf, "%7.3f Kbit/s", b/1000); 351 else 352 sprintf(bwbuf, "%7.3f bit/s ", b); 353 354 if (humanize_number(burst, sizeof(burst), p->burst, 355 "", HN_AUTOSCALE, 0) < 0 || co.verbose) 356 sprintf(burst, "%d", (int)p->burst); 357 
sprintf(buf, "%05d: %s %4d ms burst %s", 358 p->link_nr % DN_MAX_ID, bwbuf, p->delay, burst); 359 } 360 break; 361 362 case DN_FS: 363 print_flowset_parms((struct dn_fs *)oid, buf); 364 break; 365 case DN_PROFILE: 366 flush_buf(buf); 367 print_extra_delay_parms((struct dn_profile *)oid); 368 } 369 flush_buf(buf); // XXX does it really go here ? 370 } 371} 372 373/* 374 * Delete pipe, queue or scheduler i 375 */ 376int 377ipfw_delete_pipe(int do_pipe, int i) 378{ 379 struct { 380 struct dn_id oid; 381 uintptr_t a[1]; /* add more if we want a list */ 382 } cmd; 383 oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION); 384 cmd.oid.subtype = (do_pipe == 1) ? DN_LINK : 385 ( (do_pipe == 2) ? DN_FS : DN_SCH); 386 cmd.a[0] = i; 387 i = do_cmd(IP_DUMMYNET3, &cmd, cmd.oid.len); 388 if (i) { 389 i = 1; 390 warn("rule %u: setsockopt(IP_DUMMYNET_DEL)", i); 391 } 392 return i; 393} 394 395/* 396 * Code to parse delay profiles. 397 * 398 * Some link types introduce extra delays in the transmission 399 * of a packet, e.g. because of MAC level framing, contention on 400 * the use of the channel, MAC level retransmissions and so on. 401 * From our point of view, the channel is effectively unavailable 402 * for this extra time, which is constant or variable depending 403 * on the link type. Additionally, packets may be dropped after this 404 * time (e.g. on a wireless link after too many retransmissions). 405 * We can model the additional delay with an empirical curve 406 * that represents its distribution. 407 * 408 * cumulative probability 409 * 1.0 ^ 410 * | 411 * L +-- loss-level x 412 * | ****** 413 * | * 414 * | ***** 415 * | * 416 * | ** 417 * | * 418 * +-------*-------------------> 419 * delay 420 * 421 * The empirical curve may have both vertical and horizontal lines. 
 * Vertical lines represent constant delay for a range of
 * probabilities; horizontal lines correspond to a discontinuity
 * in the delay distribution: the link will use the largest delay
 * for a given probability.
 *
 * To pass the curve to dummynet, we must store the parameters
 * in a file as described below, and issue the command
 *
 *      ipfw pipe <n> config ... bw XXX profile <filename> ...
 *
 * The file format is the following, with whitespace acting as
 * a separator and '#' indicating the beginning of a comment:
 *
 * samples N
 *	the number of samples used in the internal
 *	representation (2..1024; default 100);
 *
 * loss-level L
 *	The probability above which packets are lost.
 *	(0.0 <= L <= 1.0, default 1.0 i.e. no loss);
 *
 * name identifier
 *	An optional name (listed by "ipfw pipe show")
 *	to identify the distribution;
 *
 * "delay prob" | "prob delay"
 *	One of these two lines is mandatory and defines
 *	the format of the following lines with data points.
 *
 * XXX YYY
 *	2 or more lines representing points in the curve,
 *	with either delay or probability first, according
 *	to the chosen format.
 *	The unit for delay is milliseconds.
 *
 * Data points do not need to be ordered, nor does their count need
 * to match the number specified in the "samples" line. ipfw will
 * sort and interpolate the curve as needed.
 *
 * Example of a profile file:

    name    bla_bla_bla
    samples 100
    loss-level    0.86
    prob    delay
    0       200	# minimum overhead is 200ms
    0.5     200
    0.5     300
    0.8     1000
    0.9     1300
    1       1300

 * Internally, we will convert the curve to a fixed number of
 * samples, and when it is time to transmit a packet we will
 * model the extra delay as extra bits in the packet.
477 * 478 */ 479 480#define ED_MAX_LINE_LEN 256+ED_MAX_NAME_LEN 481#define ED_TOK_SAMPLES "samples" 482#define ED_TOK_LOSS "loss-level" 483#define ED_TOK_NAME "name" 484#define ED_TOK_DELAY "delay" 485#define ED_TOK_PROB "prob" 486#define ED_TOK_BW "bw" 487#define ED_SEPARATORS " \t\n" 488#define ED_MIN_SAMPLES_NO 2 489 490/* 491 * returns 1 if s is a non-negative number, with at least one '.' 492 */ 493static int 494is_valid_number(const char *s) 495{ 496 int i, dots_found = 0; 497 int len = strlen(s); 498 499 for (i = 0; i<len; ++i) 500 if (!isdigit(s[i]) && (s[i] !='.' || ++dots_found > 1)) 501 return 0; 502 return 1; 503} 504 505/* 506 * Take as input a string describing a bandwidth value 507 * and return the numeric bandwidth value. 508 * set clocking interface or bandwidth value 509 */ 510static void 511read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen) 512{ 513 if (*bandwidth != -1) 514 warnx("duplicate token, override bandwidth value!"); 515 516 if (arg[0] >= 'a' && arg[0] <= 'z') { 517 if (!if_name) { 518 errx(1, "no if support"); 519 } 520 if (namelen >= IFNAMSIZ) 521 warn("interface name truncated"); 522 namelen--; 523 /* interface name */ 524 strncpy(if_name, arg, namelen); 525 if_name[namelen] = '\0'; 526 *bandwidth = 0; 527 } else { /* read bandwidth value */ 528 int bw; 529 char *end = NULL; 530 531 bw = strtoul(arg, &end, 0); 532 if (*end == 'K' || *end == 'k') { 533 end++; 534 bw *= 1000; 535 } else if (*end == 'M') { 536 end++; 537 bw *= 1000000; 538 } 539 if ((*end == 'B' && 540 _substrcmp2(end, "Bi", "Bit/s") != 0) || 541 _substrcmp2(end, "by", "bytes") == 0) 542 bw *= 8; 543 544 if (bw < 0) 545 errx(EX_DATAERR, "bandwidth too large"); 546 547 *bandwidth = bw; 548 if (if_name) 549 if_name[0] = '\0'; 550 } 551} 552 553struct point { 554 double prob; 555 double delay; 556}; 557 558static int 559compare_points(const void *vp1, const void *vp2) 560{ 561 const struct point *p1 = vp1; 562 const struct point *p2 = vp2; 563 double res 
= 0; 564 565 res = p1->prob - p2->prob; 566 if (res == 0) 567 res = p1->delay - p2->delay; 568 if (res < 0) 569 return -1; 570 else if (res > 0) 571 return 1; 572 else 573 return 0; 574} 575 576#define ED_EFMT(s) EX_DATAERR,"error in %s at line %d: "#s,filename,lineno 577 578static void 579load_extra_delays(const char *filename, struct dn_profile *p, 580 struct dn_link *link) 581{ 582 char line[ED_MAX_LINE_LEN]; 583 FILE *f; 584 int lineno = 0; 585 int i; 586 587 int samples = -1; 588 double loss = -1.0; 589 char profile_name[ED_MAX_NAME_LEN]; 590 int delay_first = -1; 591 int do_points = 0; 592 struct point points[ED_MAX_SAMPLES_NO]; 593 int points_no = 0; 594 595 /* XXX link never NULL? */ 596 p->link_nr = link->link_nr; 597 598 profile_name[0] = '\0'; 599 f = fopen(filename, "r"); 600 if (f == NULL) 601 err(EX_UNAVAILABLE, "fopen: %s", filename); 602 603 while (fgets(line, ED_MAX_LINE_LEN, f)) { /* read commands */ 604 char *s, *cur = line, *name = NULL, *arg = NULL; 605 606 ++lineno; 607 608 /* parse the line */ 609 while (cur) { 610 s = strsep(&cur, ED_SEPARATORS); 611 if (s == NULL || *s == '#') 612 break; 613 if (*s == '\0') 614 continue; 615 if (arg) 616 errx(ED_EFMT("too many arguments")); 617 if (name == NULL) 618 name = s; 619 else 620 arg = s; 621 } 622 if (name == NULL) /* empty line */ 623 continue; 624 if (arg == NULL) 625 errx(ED_EFMT("missing arg for %s"), name); 626 627 if (!strcasecmp(name, ED_TOK_SAMPLES)) { 628 if (samples > 0) 629 errx(ED_EFMT("duplicate ``samples'' line")); 630 if (atoi(arg) <=0) 631 errx(ED_EFMT("invalid number of samples")); 632 samples = atoi(arg); 633 if (samples>ED_MAX_SAMPLES_NO) 634 errx(ED_EFMT("too many samples, maximum is %d"), 635 ED_MAX_SAMPLES_NO); 636 do_points = 0; 637 } else if (!strcasecmp(name, ED_TOK_BW)) { 638 char buf[IFNAMSIZ]; 639 read_bandwidth(arg, &link->bandwidth, buf, sizeof(buf)); 640 } else if (!strcasecmp(name, ED_TOK_LOSS)) { 641 if (loss != -1.0) 642 errx(ED_EFMT("duplicated token: %s"), 
name); 643 if (!is_valid_number(arg)) 644 errx(ED_EFMT("invalid %s"), arg); 645 loss = atof(arg); 646 if (loss > 1) 647 errx(ED_EFMT("%s greater than 1.0"), name); 648 do_points = 0; 649 } else if (!strcasecmp(name, ED_TOK_NAME)) { 650 if (profile_name[0] != '\0') 651 errx(ED_EFMT("duplicated token: %s"), name); 652 strncpy(profile_name, arg, sizeof(profile_name) - 1); 653 profile_name[sizeof(profile_name)-1] = '\0'; 654 do_points = 0; 655 } else if (!strcasecmp(name, ED_TOK_DELAY)) { 656 if (do_points) 657 errx(ED_EFMT("duplicated token: %s"), name); 658 delay_first = 1; 659 do_points = 1; 660 } else if (!strcasecmp(name, ED_TOK_PROB)) { 661 if (do_points) 662 errx(ED_EFMT("duplicated token: %s"), name); 663 delay_first = 0; 664 do_points = 1; 665 } else if (do_points) { 666 if (!is_valid_number(name) || !is_valid_number(arg)) 667 errx(ED_EFMT("invalid point found")); 668 if (delay_first) { 669 points[points_no].delay = atof(name); 670 points[points_no].prob = atof(arg); 671 } else { 672 points[points_no].delay = atof(arg); 673 points[points_no].prob = atof(name); 674 } 675 if (points[points_no].prob > 1.0) 676 errx(ED_EFMT("probability greater than 1.0")); 677 ++points_no; 678 } else { 679 errx(ED_EFMT("unrecognised command '%s'"), name); 680 } 681 } 682 683 fclose (f); 684 685 if (samples == -1) { 686 warnx("'%s' not found, assuming 100", ED_TOK_SAMPLES); 687 samples = 100; 688 } 689 690 if (loss == -1.0) { 691 warnx("'%s' not found, assuming no loss", ED_TOK_LOSS); 692 loss = 1; 693 } 694 695 /* make sure that there are enough points. 
*/ 696 if (points_no < ED_MIN_SAMPLES_NO) 697 errx(ED_EFMT("too few samples, need at least %d"), 698 ED_MIN_SAMPLES_NO); 699 700 qsort(points, points_no, sizeof(struct point), compare_points); 701 702 /* interpolation */ 703 for (i = 0; i<points_no-1; ++i) { 704 double y1 = points[i].prob * samples; 705 double x1 = points[i].delay; 706 double y2 = points[i+1].prob * samples; 707 double x2 = points[i+1].delay; 708 709 int ix = y1; 710 int stop = y2; 711 712 if (x1 == x2) { 713 for (; ix<stop; ++ix) 714 p->samples[ix] = x1; 715 } else { 716 double m = (y2-y1)/(x2-x1); 717 double c = y1 - m*x1; 718 for (; ix<stop ; ++ix) 719 p->samples[ix] = (ix - c)/m; 720 } 721 } 722 p->samples_no = samples; 723 p->loss_level = loss * samples; 724 strncpy(p->name, profile_name, sizeof(p->name)); 725} 726 727/* 728 * configuration of pipes, schedulers, flowsets. 729 * When we configure a new scheduler, an empty pipe is created, so: 730 * 731 * do_pipe = 1 -> "pipe N config ..." only for backward compatibility 732 * sched N+Delta type fifo sched_mask ... 733 * pipe N+Delta <parameters> 734 * flowset N+Delta pipe N+Delta (no parameters) 735 * sched N type wf2q+ sched_mask ... 736 * pipe N <parameters> 737 * 738 * do_pipe = 2 -> flowset N config 739 * flowset N parameters 740 * 741 * do_pipe = 3 -> sched N config 742 * sched N parameters (default no pipe) 743 * optional Pipe N config ... 
744 * pipe ==> 745 */ 746void 747ipfw_config_pipe(int ac, char **av) 748{ 749 int i, j; 750 char *end; 751 void *par = NULL; 752 struct dn_id *buf, *base; 753 struct dn_sch *sch = NULL; 754 struct dn_link *p = NULL; 755 struct dn_fs *fs = NULL; 756 struct dn_profile *pf = NULL; 757 struct ipfw_flow_id *mask = NULL; 758 int lmax; 759 uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo; 760 761 /* 762 * allocate space for 1 header, 763 * 1 scheduler, 1 link, 1 flowset, 1 profile 764 */ 765 lmax = sizeof(struct dn_id); /* command header */ 766 lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) + 767 sizeof(struct dn_fs) + sizeof(struct dn_profile); 768 769 av++; ac--; 770 /* Pipe number */ 771 if (ac && isdigit(**av)) { 772 i = atoi(*av); av++; ac--; 773 } else 774 i = -1; 775 if (i <= 0) 776 errx(EX_USAGE, "need a pipe/flowset/sched number"); 777 base = buf = safe_calloc(1, lmax); 778 /* all commands start with a 'CONFIGURE' and a version */ 779 o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG); 780 base->id = DN_API_VERSION; 781 782 switch (co.do_pipe) { 783 case 1: /* "pipe N config ..." */ 784 /* Allocate space for the WF2Q+ scheduler, its link 785 * and the FIFO flowset. Set the number, but leave 786 * the scheduler subtype and other parameters to 0 787 * so the kernel will use appropriate defaults. 788 * XXX todo: add a flag to record if a parameter 789 * is actually configured. 790 * If we do a 'pipe config' mask -> sched_mask. 791 * The FIFO scheduler and link are derived from the 792 * WF2Q+ one in the kernel. 
793 */ 794 sch = o_next(&buf, sizeof(*sch), DN_SCH); 795 p = o_next(&buf, sizeof(*p), DN_LINK); 796 fs = o_next(&buf, sizeof(*fs), DN_FS); 797 798 sch->sched_nr = i; 799 sch->oid.subtype = 0; /* defaults to WF2Q+ */ 800 mask = &sch->sched_mask; 801 flags = &sch->flags; 802 buckets = &sch->buckets; 803 *flags |= DN_PIPE_CMD; 804 805 p->link_nr = i; 806 807 /* This flowset is only for the FIFO scheduler */ 808 fs->fs_nr = i + 2*DN_MAX_ID; 809 fs->sched_nr = i + DN_MAX_ID; 810 break; 811 812 case 2: /* "queue N config ... " */ 813 fs = o_next(&buf, sizeof(*fs), DN_FS); 814 fs->fs_nr = i; 815 mask = &fs->flow_mask; 816 flags = &fs->flags; 817 buckets = &fs->buckets; 818 break; 819 820 case 3: /* "sched N config ..." */ 821 sch = o_next(&buf, sizeof(*sch), DN_SCH); 822 fs = o_next(&buf, sizeof(*fs), DN_FS); 823 sch->sched_nr = i; 824 mask = &sch->sched_mask; 825 flags = &sch->flags; 826 buckets = &sch->buckets; 827 /* fs is used only with !MULTIQUEUE schedulers */ 828 fs->fs_nr = i + DN_MAX_ID; 829 fs->sched_nr = i; 830 break; 831 } 832 /* set to -1 those fields for which we want to reuse existing 833 * values from the kernel. 834 * Also, *_nr and subtype = 0 mean reuse the value from the kernel. 835 * XXX todo: support reuse of the mask. 
836 */ 837 if (p) 838 p->bandwidth = -1; 839 for (j = 0; j < sizeof(fs->par)/sizeof(fs->par[0]); j++) 840 fs->par[j] = -1; 841 while (ac > 0) { 842 double d; 843 int tok = match_token(dummynet_params, *av); 844 ac--; av++; 845 846 switch(tok) { 847 case TOK_NOERROR: 848 NEED(fs, "noerror is only for pipes"); 849 fs->flags |= DN_NOERROR; 850 break; 851 852 case TOK_PLR: 853 NEED(fs, "plr is only for pipes"); 854 NEED1("plr needs argument 0..1\n"); 855 d = strtod(av[0], NULL); 856 if (d > 1) 857 d = 1; 858 else if (d < 0) 859 d = 0; 860 fs->plr = (int)(d*0x7fffffff); 861 ac--; av++; 862 break; 863 864 case TOK_QUEUE: 865 NEED(fs, "queue is only for pipes or flowsets"); 866 NEED1("queue needs queue size\n"); 867 end = NULL; 868 fs->qsize = strtoul(av[0], &end, 0); 869 if (*end == 'K' || *end == 'k') { 870 fs->flags |= DN_QSIZE_BYTES; 871 fs->qsize *= 1024; 872 } else if (*end == 'B' || 873 _substrcmp2(end, "by", "bytes") == 0) { 874 fs->flags |= DN_QSIZE_BYTES; 875 } 876 ac--; av++; 877 break; 878 879 case TOK_BUCKETS: 880 NEED(fs, "buckets is only for pipes or flowsets"); 881 NEED1("buckets needs argument\n"); 882 *buckets = strtoul(av[0], NULL, 0); 883 ac--; av++; 884 break; 885 886 case TOK_FLOW_MASK: 887 case TOK_SCHED_MASK: 888 case TOK_MASK: 889 NEED(mask, "tok_mask"); 890 NEED1("mask needs mask specifier\n"); 891 /* 892 * per-flow queue, mask is dst_ip, dst_port, 893 * src_ip, src_port, proto measured in bits 894 */ 895 par = NULL; 896 897 bzero(mask, sizeof(*mask)); 898 end = NULL; 899 900 while (ac >= 1) { 901 uint32_t *p32 = NULL; 902 uint16_t *p16 = NULL; 903 uint32_t *p20 = NULL; 904 struct in6_addr *pa6 = NULL; 905 uint32_t a; 906 907 tok = match_token(dummynet_params, *av); 908 ac--; av++; 909 switch(tok) { 910 case TOK_ALL: 911 /* 912 * special case, all bits significant 913 */ 914 mask->dst_ip = ~0; 915 mask->src_ip = ~0; 916 mask->dst_port = ~0; 917 mask->src_port = ~0; 918 mask->proto = ~0; 919 n2mask(&mask->dst_ip6, 128); 920 n2mask(&mask->src_ip6, 
128); 921 mask->flow_id6 = ~0; 922 *flags |= DN_HAVE_MASK; 923 goto end_mask; 924 925 case TOK_DSTIP: 926 mask->addr_type = 4; 927 p32 = &mask->dst_ip; 928 break; 929 930 case TOK_SRCIP: 931 mask->addr_type = 4; 932 p32 = &mask->src_ip; 933 break; 934 935 case TOK_DSTIP6: 936 mask->addr_type = 6; 937 pa6 = &mask->dst_ip6; 938 break; 939 940 case TOK_SRCIP6: 941 mask->addr_type = 6; 942 pa6 = &mask->src_ip6; 943 break; 944 945 case TOK_FLOWID: 946 mask->addr_type = 6; 947 p20 = &mask->flow_id6; 948 break; 949 950 case TOK_DSTPORT: 951 p16 = &mask->dst_port; 952 break; 953 954 case TOK_SRCPORT: 955 p16 = &mask->src_port; 956 break; 957 958 case TOK_PROTO: 959 break; 960 961 default: 962 ac++; av--; /* backtrack */ 963 goto end_mask; 964 } 965 if (ac < 1) 966 errx(EX_USAGE, "mask: value missing"); 967 if (*av[0] == '/') { 968 a = strtoul(av[0]+1, &end, 0); 969 if (pa6 == NULL) 970 a = (a == 32) ? ~0 : (1 << a) - 1; 971 } else 972 a = strtoul(av[0], &end, 0); 973 if (p32 != NULL) 974 *p32 = a; 975 else if (p16 != NULL) { 976 if (a > 0xFFFF) 977 errx(EX_DATAERR, 978 "port mask must be 16 bit"); 979 *p16 = (uint16_t)a; 980 } else if (p20 != NULL) { 981 if (a > 0xfffff) 982 errx(EX_DATAERR, 983 "flow_id mask must be 20 bit"); 984 *p20 = (uint32_t)a; 985 } else if (pa6 != NULL) { 986 if (a > 128) 987 errx(EX_DATAERR, 988 "in6addr invalid mask len"); 989 else 990 n2mask(pa6, a); 991 } else { 992 if (a > 0xFF) 993 errx(EX_DATAERR, 994 "proto mask must be 8 bit"); 995 fs->flow_mask.proto = (uint8_t)a; 996 } 997 if (a != 0) 998 *flags |= DN_HAVE_MASK; 999 ac--; av++; 1000 } /* end while, config masks */ 1001end_mask: 1002 break; 1003 1004 case TOK_RED: 1005 case TOK_GRED: 1006 NEED1("red/gred needs w_q/min_th/max_th/max_p\n"); 1007 fs->flags |= DN_IS_RED; 1008 if (tok == TOK_GRED) 1009 fs->flags |= DN_IS_GENTLE_RED; 1010 /* 1011 * the format for parameters is w_q/min_th/max_th/max_p 1012 */ 1013 if ((end = strsep(&av[0], "/"))) { 1014 double w_q = strtod(end, NULL); 1015 if 
(w_q > 1 || w_q <= 0) 1016 errx(EX_DATAERR, "0 < w_q <= 1"); 1017 fs->w_q = (int) (w_q * (1 << SCALE_RED)); 1018 } 1019 if ((end = strsep(&av[0], "/"))) { 1020 fs->min_th = strtoul(end, &end, 0); 1021 if (*end == 'K' || *end == 'k') 1022 fs->min_th *= 1024; 1023 } 1024 if ((end = strsep(&av[0], "/"))) { 1025 fs->max_th = strtoul(end, &end, 0); 1026 if (*end == 'K' || *end == 'k') 1027 fs->max_th *= 1024; 1028 } 1029 if ((end = strsep(&av[0], "/"))) { 1030 double max_p = strtod(end, NULL); 1031 if (max_p > 1 || max_p <= 0) 1032 errx(EX_DATAERR, "0 < max_p <= 1"); 1033 fs->max_p = (int)(max_p * (1 << SCALE_RED)); 1034 } 1035 ac--; av++; 1036 break; 1037 1038 case TOK_DROPTAIL: 1039 NEED(fs, "droptail is only for flowsets"); 1040 fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED); 1041 break; 1042 1043 case TOK_BW: 1044 NEED(p, "bw is only for links"); 1045 NEED1("bw needs bandwidth or interface\n"); 1046 read_bandwidth(av[0], &p->bandwidth, NULL, 0); 1047 ac--; av++; 1048 break; 1049 1050 case TOK_DELAY: 1051 NEED(p, "delay is only for links"); 1052 NEED1("delay needs argument 0..10000ms\n"); 1053 p->delay = strtoul(av[0], NULL, 0); 1054 ac--; av++; 1055 break; 1056 1057 case TOK_TYPE: { 1058 int l; 1059 NEED(sch, "type is only for schedulers"); 1060 NEED1("type needs a string"); 1061 l = strlen(av[0]); 1062 if (l == 0 || l > 15) 1063 errx(1, "type %s too long\n", av[0]); 1064 strcpy(sch->name, av[0]); 1065 sch->oid.subtype = 0; /* use string */ 1066 ac--; av++; 1067 break; 1068 } 1069 1070 case TOK_WEIGHT: 1071 NEED(fs, "weight is only for flowsets"); 1072 NEED1("weight needs argument\n"); 1073 fs->par[0] = strtol(av[0], &end, 0); 1074 ac--; av++; 1075 break; 1076 1077 case TOK_LMAX: 1078 NEED(fs, "lmax is only for flowsets"); 1079 NEED1("lmax needs argument\n"); 1080 fs->par[1] = strtol(av[0], &end, 0); 1081 ac--; av++; 1082 break; 1083 1084 case TOK_PRI: 1085 NEED(fs, "priority is only for flowsets"); 1086 NEED1("priority needs argument\n"); 1087 fs->par[2] = 
strtol(av[0], &end, 0); 1088 ac--; av++; 1089 break; 1090 1091 case TOK_SCHED: 1092 case TOK_PIPE: 1093 NEED(fs, "pipe/sched"); 1094 NEED1("pipe/link/sched needs number\n"); 1095 fs->sched_nr = strtoul(av[0], &end, 0); 1096 ac--; av++; 1097 break; 1098 1099 case TOK_PROFILE: 1100 NEED((!pf), "profile already set"); 1101 NEED(p, "profile"); 1102 { 1103 NEED1("extra delay needs the file name\n"); 1104 pf = o_next(&buf, sizeof(*pf), DN_PROFILE); 1105 load_extra_delays(av[0], pf, p); //XXX can't fail? 1106 --ac; ++av; 1107 } 1108 break; 1109 1110 case TOK_BURST: 1111 NEED(p, "burst"); 1112 NEED1("burst needs argument\n"); 1113 errno = 0; 1114 if (expand_number(av[0], (int64_t *)&p->burst) < 0) 1115 if (errno != ERANGE) 1116 errx(EX_DATAERR, 1117 "burst: invalid argument"); 1118 if (errno || p->burst > (1ULL << 48) - 1) 1119 errx(EX_DATAERR, 1120 "burst: out of range (0..2^48-1)"); 1121 ac--; av++; 1122 break; 1123 1124 default: 1125 errx(EX_DATAERR, "unrecognised option ``%s''", av[-1]); 1126 } 1127 } 1128 1129 /* check validity of parameters */ 1130 if (p) { 1131 if (p->delay > 10000) 1132 errx(EX_DATAERR, "delay must be < 10000"); 1133 if (p->bandwidth == -1) 1134 p->bandwidth = 0; 1135 } 1136 if (fs) { 1137 /* XXX accept a 0 scheduler to keep the default */ 1138 if (fs->flags & DN_QSIZE_BYTES) { 1139 size_t len; 1140 long limit; 1141 1142 len = sizeof(limit); 1143 if (sysctlbyname("net.inet.ip.dummynet.pipe_byte_limit", 1144 &limit, &len, NULL, 0) == -1) 1145 limit = 1024*1024; 1146 if (fs->qsize > limit) 1147 errx(EX_DATAERR, "queue size must be < %ldB", limit); 1148 } else { 1149 size_t len; 1150 long limit; 1151 1152 len = sizeof(limit); 1153 if (sysctlbyname("net.inet.ip.dummynet.pipe_slot_limit", 1154 &limit, &len, NULL, 0) == -1) 1155 limit = 100; 1156 if (fs->qsize > limit) 1157 errx(EX_DATAERR, "2 <= queue size <= %ld", limit); 1158 } 1159 1160 if (fs->flags & DN_IS_RED) { 1161 size_t len; 1162 int lookup_depth, avg_pkt_size; 1163 double w_q; 1164 1165 if 
(fs->min_th >= fs->max_th) 1166 errx(EX_DATAERR, "min_th %d must be < than max_th %d", 1167 fs->min_th, fs->max_th); 1168 if (fs->max_th == 0) 1169 errx(EX_DATAERR, "max_th must be > 0"); 1170 1171 len = sizeof(int); 1172 if (sysctlbyname("net.inet.ip.dummynet.red_lookup_depth", 1173 &lookup_depth, &len, NULL, 0) == -1) 1174 lookup_depth = 256; 1175 if (lookup_depth == 0) 1176 errx(EX_DATAERR, "net.inet.ip.dummynet.red_lookup_depth" 1177 " must be greater than zero"); 1178 1179 len = sizeof(int); 1180 if (sysctlbyname("net.inet.ip.dummynet.red_avg_pkt_size", 1181 &avg_pkt_size, &len, NULL, 0) == -1) 1182 avg_pkt_size = 512; 1183 1184 if (avg_pkt_size == 0) 1185 errx(EX_DATAERR, 1186 "net.inet.ip.dummynet.red_avg_pkt_size must" 1187 " be greater than zero"); 1188 1189 /* 1190 * Ticks needed for sending a medium-sized packet. 1191 * Unfortunately, when we are configuring a WF2Q+ queue, we 1192 * do not have bandwidth information, because that is stored 1193 * in the parent pipe, and also we have multiple queues 1194 * competing for it. So we set s=0, which is not very 1195 * correct. But on the other hand, why do we want RED with 1196 * WF2Q+ ? 1197 */ 1198#if 0 1199 if (p.bandwidth==0) /* this is a WF2Q+ queue */ 1200 s = 0; 1201 else 1202 s = (double)ck.hz * avg_pkt_size * 8 / p.bandwidth; 1203#endif 1204 /* 1205 * max idle time (in ticks) before avg queue size becomes 0. 1206 * NOTA: (3/w_q) is approx the value x so that 1207 * (1-w_q)^x < 10^-3. 1208 */ 1209 w_q = ((double)fs->w_q) / (1 << SCALE_RED); 1210#if 0 // go in kernel 1211 idle = s * 3. 
/ w_q; 1212 fs->lookup_step = (int)idle / lookup_depth; 1213 if (!fs->lookup_step) 1214 fs->lookup_step = 1; 1215 weight = 1 - w_q; 1216 for (t = fs->lookup_step; t > 1; --t) 1217 weight *= 1 - w_q; 1218 fs->lookup_weight = (int)(weight * (1 << SCALE_RED)); 1219#endif 1220 } 1221 } 1222 1223 i = do_cmd(IP_DUMMYNET3, base, (char *)buf - (char *)base); 1224 1225 if (i) 1226 err(1, "setsockopt(%s)", "IP_DUMMYNET_CONFIGURE"); 1227} 1228 1229void 1230dummynet_flush(void) 1231{ 1232 struct dn_id oid; 1233 oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION); 1234 do_cmd(IP_DUMMYNET3, &oid, oid.len); 1235} 1236 1237/* Parse input for 'ipfw [pipe|sched|queue] show [range list]' 1238 * Returns the number of ranges, and possibly stores them 1239 * in the array v of size len. 1240 */ 1241static int 1242parse_range(int ac, char *av[], uint32_t *v, int len) 1243{ 1244 int n = 0; 1245 char *endptr, *s; 1246 uint32_t base[2]; 1247 1248 if (v == NULL || len < 2) { 1249 v = base; 1250 len = 2; 1251 } 1252 1253 for (s = *av; s != NULL; av++, ac--) { 1254 v[0] = strtoul(s, &endptr, 10); 1255 v[1] = (*endptr != '-') ? v[0] : 1256 strtoul(endptr+1, &endptr, 10); 1257 if (*endptr == '\0') { /* prepare for next round */ 1258 s = (ac > 0) ? *(av+1) : NULL; 1259 } else { 1260 if (*endptr != ',') { 1261 warn("invalid number: %s", s); 1262 s = ++endptr; 1263 continue; 1264 } 1265 /* continue processing from here */ 1266 s = ++endptr; 1267 ac++; 1268 av--; 1269 } 1270 if (v[1] < v[0] || 1271 v[1] < 0 || v[1] >= DN_MAX_ID-1 || 1272 v[0] < 0 || v[1] >= DN_MAX_ID-1) { 1273 continue; /* invalid entry */ 1274 } 1275 n++; 1276 /* translate if 'pipe list' */ 1277 if (co.do_pipe == 1) { 1278 v[0] += DN_MAX_ID; 1279 v[1] += DN_MAX_ID; 1280 } 1281 v = (n*2 < len) ? v + 2 : base; 1282 } 1283 return n; 1284} 1285 1286/* main entry point for dummynet list functions. co.do_pipe indicates 1287 * which function we want to support. 
1288 * av may contain filtering arguments, either individual entries 1289 * or ranges, or lists (space or commas are valid separators). 1290 * Format for a range can be n1-n2 or n3 n4 n5 ... 1291 * In a range n1 must be <= n2, otherwise the range is ignored. 1292 * A number 'n4' is translate in a range 'n4-n4' 1293 * All number must be > 0 and < DN_MAX_ID-1 1294 */ 1295void 1296dummynet_list(int ac, char *av[], int show_counters) 1297{ 1298 struct dn_id *oid, *x = NULL; 1299 int ret, i, l; 1300 int n; /* # of ranges */ 1301 int buflen; 1302 int max_size; /* largest obj passed up */ 1303 1304 ac--; 1305 av++; /* skip 'list' | 'show' word */ 1306 1307 n = parse_range(ac, av, NULL, 0); /* Count # of ranges. */ 1308 1309 /* Allocate space to store ranges */ 1310 l = sizeof(*oid) + sizeof(uint32_t) * n * 2; 1311 oid = safe_calloc(1, l); 1312 oid_fill(oid, l, DN_CMD_GET, DN_API_VERSION); 1313 1314 if (n > 0) /* store ranges in idx */ 1315 parse_range(ac, av, (uint32_t *)(oid + 1), n*2); 1316 /* 1317 * Compute the size of the largest object returned. If the 1318 * response leaves at least this much spare space in the 1319 * buffer, then surely the response is complete; otherwise 1320 * there might be a risk of truncation and we will need to 1321 * retry with a larger buffer. 1322 * XXX don't bother with smaller structs. 1323 */ 1324 max_size = sizeof(struct dn_fs); 1325 if (max_size < sizeof(struct dn_sch)) 1326 max_size = sizeof(struct dn_sch); 1327 if (max_size < sizeof(struct dn_flow)) 1328 max_size = sizeof(struct dn_flow); 1329 1330 switch (co.do_pipe) { 1331 case 1: 1332 oid->subtype = DN_LINK; /* list pipe */ 1333 break; 1334 case 2: 1335 oid->subtype = DN_FS; /* list queue */ 1336 break; 1337 case 3: 1338 oid->subtype = DN_SCH; /* list sched */ 1339 break; 1340 } 1341 1342 /* 1343 * Ask the kernel an estimate of the required space (result 1344 * in oid.id), unless we are requesting a subset of objects, 1345 * in which case the kernel does not give an exact answer. 
1346 * In any case, space might grow in the meantime due to the 1347 * creation of new queues, so we must be prepared to retry. 1348 */ 1349 if (n > 0) { 1350 buflen = 4*1024; 1351 } else { 1352 ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l); 1353 if (ret != 0 || oid->id <= sizeof(*oid)) 1354 goto done; 1355 buflen = oid->id + max_size; 1356 oid->len = sizeof(*oid); /* restore */ 1357 } 1358 /* Try a few times, until the buffer fits */ 1359 for (i = 0; i < 20; i++) { 1360 l = buflen; 1361 x = safe_realloc(x, l); 1362 bcopy(oid, x, oid->len); 1363 ret = do_cmd(-IP_DUMMYNET3, x, (uintptr_t)&l); 1364 if (ret != 0 || x->id <= sizeof(*oid)) 1365 goto done; /* no response */ 1366 if (l + max_size <= buflen) 1367 break; /* ok */ 1368 buflen *= 2; /* double for next attempt */ 1369 } 1370 list_pipes(x, O_NEXT(x, l)); 1371done: 1372 if (x) 1373 free(x); 1374 free(oid); 1375} 1376