dummynet.c revision 205050
1/*
2 * Copyright (c) 2002-2003,2010 Luigi Rizzo
3 *
4 * Redistribution and use in source forms, with and without modification,
5 * are permitted provided that this entire comment appears intact.
6 *
7 * Redistribution in binary form may occur without any restrictions.
8 * Obviously, it would be nice if you gave credit where credit is due
9 * but requiring it would be too onerous.
10 *
11 * This software is provided ``AS IS'' without any warranties of any kind.
12 *
13 * $FreeBSD: head/sbin/ipfw/dummynet.c 205050 2010-03-11 22:42:33Z luigi $
14 *
15 * dummynet support
16 */
17
#include <sys/types.h>
#include <sys/socket.h>
/* XXX there are several sysctl leftover here */
#include <sys/sysctl.h>

#include "ipfw2.h"

#include <ctype.h>
#include <err.h>
#include <errno.h>
#include <libutil.h>
#include <limits.h>
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>

#include <net/if.h>
#include <netinet/in.h>
#include <netinet/ip_fw.h>
#include <netinet/ip_dummynet.h>
#include <arpa/inet.h>	/* inet_ntoa */
40
41
/*
 * Keyword table for the dummynet configuration parser: each entry maps
 * an option word to its TOK_* token. The table is searched with
 * match_token() while parsing "config" arguments. Several words are
 * aliases for the same token (e.g. "bw"/"bandwidth", "pri"/"priority",
 * "lmax"/"maxlen"). The list ends with a { NULL, 0 } sentinel.
 * NOTE(review): entry order is preserved on purpose; lookups that map
 * a token back to a word presumably return the first match -- confirm
 * before reordering.
 */
static struct _s_x dummynet_params[] = {
	{ "plr",		TOK_PLR },
	{ "noerror",		TOK_NOERROR },
	{ "buckets",		TOK_BUCKETS },
	{ "dst-ip",		TOK_DSTIP },
	{ "src-ip",		TOK_SRCIP },
	{ "dst-port",		TOK_DSTPORT },
	{ "src-port",		TOK_SRCPORT },
	{ "proto",		TOK_PROTO },
	{ "weight",		TOK_WEIGHT },
	{ "lmax",		TOK_LMAX },
	{ "maxlen",		TOK_LMAX },
	{ "all",		TOK_ALL },
	{ "mask",		TOK_MASK }, /* alias for both */
	{ "sched_mask",		TOK_SCHED_MASK },
	{ "flow_mask",		TOK_FLOW_MASK },
	{ "droptail",		TOK_DROPTAIL },
	{ "red",		TOK_RED },
	{ "gred",		TOK_GRED },
	{ "bw",			TOK_BW },
	{ "bandwidth",		TOK_BW },
	{ "delay",		TOK_DELAY },
	{ "link",		TOK_LINK },
	{ "pipe",		TOK_PIPE },
	{ "queue",		TOK_QUEUE },
	{ "flowset",		TOK_FLOWSET },
	{ "sched",		TOK_SCHED },
	{ "pri",		TOK_PRI },
	{ "priority",		TOK_PRI },
	{ "type",		TOK_TYPE },
	{ "flow-id",		TOK_FLOWID},
	{ "dst-ipv6",		TOK_DSTIP6},
	{ "dst-ip6",		TOK_DSTIP6},
	{ "src-ipv6",		TOK_SRCIP6},
	{ "src-ip6",		TOK_SRCIP6},
	{ "profile",		TOK_PROFILE},
	{ "burst",		TOK_BURST},
	{ "dummynet-params",	TOK_NULL },
	{ NULL, 0 }	/* terminator */
};
82
/*
 * Address of the object that starts `len` bytes after `p`.
 * Both arguments are parenthesized so that compound expressions
 * (e.g. a conditional) can be passed safely.
 */
#define O_NEXT(p, len) ((void *)((char *)(p) + (len)))
84
85static void
86oid_fill(struct dn_id *oid, int len, int type, uintptr_t id)
87{
88	oid->len = len;
89	oid->type = type;
90	oid->subtype = 0;
91	oid->id = id;
92}
93
/*
 * Reserve `len` bytes at the current buffer position *o for an object
 * of the given type: the header is filled in (id 0, subtype 0), *o is
 * advanced past the object, and the start of the object is returned.
 */
static void *
o_next(struct dn_id **o, int len, int type)
{
	struct dn_id *cur = *o;

	*o = O_NEXT(cur, len);
	oid_fill(cur, len, type, 0);
	return cur;
}
103
/*
 * Disabled code (compiled out by "#if 0"): comparison callback that
 * orders flow queues by one of four counters selected by co.do_sort;
 * a negative co.do_sort requests the reverse order.
 */
#if 0
static int
sort_q(void *arg, const void *pa, const void *pb)
{
	/* sign of do_sort selects the direction, magnitude the field */
	int rev = (co.do_sort < 0);
	int field = rev ? -co.do_sort : co.do_sort;
	long long res = 0;
	const struct dn_flow_queue *a = pa;
	const struct dn_flow_queue *b = pb;

	switch (field) {
	case 1: /* pkts */
		res = a->len - b->len;
		break;
	case 2: /* bytes */
		res = a->len_bytes - b->len_bytes;
		break;

	case 3: /* tot pkts */
		res = a->tot_pkts - b->tot_pkts;
		break;

	case 4: /* tot bytes */
		res = a->tot_bytes - b->tot_bytes;
		break;
	}
	/* clamp the difference to -1/0/1 before narrowing to int */
	if (res < 0)
		res = -1;
	if (res > 0)
		res = 1;
	return (int)(rev ? res : -res);
}
#endif
137
138/* print a mask and header for the subsequent list of flows */
139static void
140print_mask(struct ipfw_flow_id *id)
141{
142	if (!IS_IP6_FLOW_ID(id)) {
143		printf("    "
144		    "mask: 0x%02x 0x%08x/0x%04x -> 0x%08x/0x%04x\n",
145		    id->proto,
146		    id->src_ip, id->src_port,
147		    id->dst_ip, id->dst_port);
148
149		printf("BKT Prot ___Source IP/port____ "
150		    "____Dest. IP/port____ "
151		    "Tot_pkt/bytes Pkt/Byte Drp\n");
152	} else {
153		char buf[255];
154		printf("\n        mask: proto: 0x%02x, flow_id: 0x%08x,  ",
155		    id->proto, id->flow_id6);
156		inet_ntop(AF_INET6, &(id->src_ip6), buf, sizeof(buf));
157		printf("%s/0x%04x -> ", buf, id->src_port);
158		inet_ntop(AF_INET6, &(id->dst_ip6), buf, sizeof(buf));
159		printf("%s/0x%04x\n", buf, id->dst_port);
160
161		printf("BKT ___Prot___ _flow-id_ "
162		    "______________Source IPv6/port_______________ "
163		    "_______________Dest. IPv6/port_______________ "
164		    "Tot_pkt/bytes Pkt/Byte Drp\n");
165	}
166}
167
168static void
169list_flow(struct dn_flow *ni)
170{
171	char buff[255];
172	struct protoent *pe;
173	struct in_addr ina;
174	struct ipfw_flow_id *id = &ni->fid;
175
176	pe = getprotobynumber(id->proto);
177		/* XXX: Should check for IPv4 flows */
178	printf("%3u ", (ni->oid.id) & 0xff);
179	if (!IS_IP6_FLOW_ID(id)) {
180		if (pe)
181			printf("%-4s ", pe->p_name);
182		else
183			printf("%4u ", id->proto);
184		ina.s_addr = htonl(id->src_ip);
185		printf("%15s/%-5d ",
186		    inet_ntoa(ina), id->src_port);
187		ina.s_addr = htonl(id->dst_ip);
188		printf("%15s/%-5d ",
189		    inet_ntoa(ina), id->dst_port);
190	} else {
191		/* Print IPv6 flows */
192		if (pe != NULL)
193			printf("%9s ", pe->p_name);
194		else
195			printf("%9u ", id->proto);
196		printf("%7d  %39s/%-5d ", id->flow_id6,
197		    inet_ntop(AF_INET6, &(id->src_ip6), buff, sizeof(buff)),
198		    id->src_port);
199		printf(" %39s/%-5d ",
200		    inet_ntop(AF_INET6, &(id->dst_ip6), buff, sizeof(buff)),
201		    id->dst_port);
202	}
203	printf("%4llu %8llu %2u %4u %3u\n",
204	    align_uint64(&ni->tot_pkts),
205	    align_uint64(&ni->tot_bytes),
206	    ni->length, ni->len_bytes, ni->drops);
207}
208
209static void
210print_flowset_parms(struct dn_fs *fs, char *prefix)
211{
212	int l;
213	char qs[30];
214	char plr[30];
215	char red[90];	/* Display RED parameters */
216
217	l = fs->qsize;
218	if (fs->flags & DN_QSIZE_BYTES) {
219		if (l >= 8192)
220			sprintf(qs, "%d KB", l / 1024);
221		else
222			sprintf(qs, "%d B", l);
223	} else
224		sprintf(qs, "%3d sl.", l);
225	if (fs->plr)
226		sprintf(plr, "plr %f", 1.0 * fs->plr / (double)(0x7fffffff));
227	else
228		plr[0] = '\0';
229
230	if (fs->flags & DN_IS_RED)	/* RED parameters */
231		sprintf(red,
232		    "\n\t %cRED w_q %f min_th %d max_th %d max_p %f",
233		    (fs->flags & DN_IS_GENTLE_RED) ? 'G' : ' ',
234		    1.0 * fs->w_q / (double)(1 << SCALE_RED),
235		    fs->min_th,
236		    fs->max_th,
237		    1.0 * fs->max_p / (double)(1 << SCALE_RED));
238	else
239		sprintf(red, "droptail");
240
241	if (prefix[0]) {
242	    printf("%s %s%s %d queues (%d buckets) %s\n",
243		prefix, qs, plr, fs->oid.id, fs->buckets, red);
244	    prefix[0] = '\0';
245	} else {
246	    printf("q%05d %s%s %d flows (%d buckets) sched %d "
247			"weight %d lmax %d pri %d %s\n",
248		fs->fs_nr, qs, plr, fs->oid.id, fs->buckets,
249		fs->sched_nr, fs->par[0], fs->par[1], fs->par[2], red);
250	    if (fs->flags & DN_HAVE_MASK)
251		print_mask(&fs->flow_mask);
252	}
253}
254
255static void
256print_extra_delay_parms(struct dn_profile *p)
257{
258	double loss;
259	if (p->samples_no <= 0)
260		return;
261
262	loss = p->loss_level;
263	loss /= p->samples_no;
264	printf("\t profile: name \"%s\" loss %f samples %d\n",
265		p->name, loss, p->samples_no);
266}
267
/*
 * Emit the pending text in buf (if any) on its own line, then mark
 * the buffer as empty.
 */
static void
flush_buf(char *buf)
{
	if (*buf != '\0')
		puts(buf);
	*buf = '\0';
}
275
276/*
277 * generic list routine. We expect objects in a specific order, i.e.
278 * PIPES AND SCHEDULERS:
279 *	link; scheduler; internal flowset if any; instances
280 * we can tell a pipe from the number.
281 *
282 * FLOWSETS:
283 *	flowset; queues;
284 * link i (int queue); scheduler i; si(i) { flowsets() : queues }
285 */
286static void
287list_pipes(struct dn_id *oid, struct dn_id *end)
288{
289    char buf[160];	/* pending buffer */
290    buf[0] = '\0';
291
292    for (; oid != end; oid = O_NEXT(oid, oid->len)) {
293	if (oid->len < sizeof(*oid))
294		errx(1, "invalid oid len %d\n", oid->len);
295
296	switch (oid->type) {
297	default:
298	    flush_buf(buf);
299	    printf("unrecognized object %d size %d\n", oid->type, oid->len);
300	    break;
301	case DN_TEXT: /* list of attached flowsets */
302	    {
303		int i, l;
304		struct {
305			struct dn_id id;
306			uint32_t p[0];
307		} *d = (void *)oid;
308		l = (oid->len - sizeof(*oid))/sizeof(d->p[0]);
309		if (l == 0)
310		    break;
311		printf("   Children flowsets: ");
312		for (i = 0; i < l; i++)
313			printf("%u ", d->p[i]);
314		printf("\n");
315		break;
316	    }
317	case DN_CMD_GET:
318	    if (co.verbose)
319		printf("answer for cmd %d, len %d\n", oid->type, oid->id);
320	    break;
321	case DN_SCH: {
322	    struct dn_sch *s = (struct dn_sch *)oid;
323	    flush_buf(buf);
324	    printf(" sched %d type %s flags 0x%x %d buckets %d active\n",
325			s->sched_nr,
326			s->name, s->flags, s->buckets, s->oid.id);
327	    if (s->flags & DN_HAVE_MASK)
328		print_mask(&s->sched_mask);
329	    }
330	    break;
331
332	case DN_FLOW:
333	    list_flow((struct dn_flow *)oid);
334	    break;
335
336	case DN_LINK: {
337	    struct dn_link *p = (struct dn_link *)oid;
338	    double b = p->bandwidth;
339	    char bwbuf[30];
340	    char burst[5 + 7];
341
342	    /* This starts a new object so flush buffer */
343	    flush_buf(buf);
344	    /* data rate */
345	    if (b == 0)
346		sprintf(bwbuf, "unlimited     ");
347	    else if (b >= 1000000)
348		sprintf(bwbuf, "%7.3f Mbit/s", b/1000000);
349	    else if (b >= 1000)
350		sprintf(bwbuf, "%7.3f Kbit/s", b/1000);
351	    else
352		sprintf(bwbuf, "%7.3f bit/s ", b);
353
354	    if (humanize_number(burst, sizeof(burst), p->burst,
355		    "", HN_AUTOSCALE, 0) < 0 || co.verbose)
356		sprintf(burst, "%d", (int)p->burst);
357	    sprintf(buf, "%05d: %s %4d ms burst %s",
358		p->link_nr % DN_MAX_ID, bwbuf, p->delay, burst);
359	    }
360	    break;
361
362	case DN_FS:
363	    print_flowset_parms((struct dn_fs *)oid, buf);
364	    break;
365	case DN_PROFILE:
366	    flush_buf(buf);
367	    print_extra_delay_parms((struct dn_profile *)oid);
368	}
369	flush_buf(buf); // XXX does it really go here ?
370    }
371}
372
373/*
374 * Delete pipe, queue or scheduler i
375 */
376int
377ipfw_delete_pipe(int do_pipe, int i)
378{
379	struct {
380		struct dn_id oid;
381		uintptr_t a[1];	/* add more if we want a list */
382	} cmd;
383	oid_fill((void *)&cmd, sizeof(cmd), DN_CMD_DELETE, DN_API_VERSION);
384	cmd.oid.subtype = (do_pipe == 1) ? DN_LINK :
385		( (do_pipe == 2) ? DN_FS : DN_SCH);
386	cmd.a[0] = i;
387	i = do_cmd(IP_DUMMYNET3, &cmd, cmd.oid.len);
388	if (i) {
389		i = 1;
390		warn("rule %u: setsockopt(IP_DUMMYNET_DEL)", i);
391	}
392	return i;
393}
394
395/*
396 * Code to parse delay profiles.
397 *
398 * Some link types introduce extra delays in the transmission
399 * of a packet, e.g. because of MAC level framing, contention on
400 * the use of the channel, MAC level retransmissions and so on.
401 * From our point of view, the channel is effectively unavailable
402 * for this extra time, which is constant or variable depending
403 * on the link type. Additionally, packets may be dropped after this
404 * time (e.g. on a wireless link after too many retransmissions).
405 * We can model the additional delay with an empirical curve
406 * that represents its distribution.
407 *
408 *	cumulative probability
409 *	1.0 ^
410 *	    |
411 *	L   +-- loss-level          x
412 *	    |                 ******
413 *	    |                *
414 *	    |           *****
415 *	    |          *
416 *	    |        **
417 *	    |       *
418 *	    +-------*------------------->
419 *			delay
420 *
421 * The empirical curve may have both vertical and horizontal lines.
422 * Vertical lines represent constant delay for a range of
 * probabilities; horizontal lines correspond to a discontinuity
424 * in the delay distribution: the link will use the largest delay
425 * for a given probability.
426 *
427 * To pass the curve to dummynet, we must store the parameters
428 * in a file as described below, and issue the command
429 *
430 *      ipfw pipe <n> config ... bw XXX profile <filename> ...
431 *
432 * The file format is the following, with whitespace acting as
 * a separator and '#' indicating the beginning of a comment:
434 *
435 *	samples N
436 *		the number of samples used in the internal
437 *		representation (2..1024; default 100);
438 *
439 *	loss-level L
440 *		The probability above which packets are lost.
441 *               (0.0 <= L <= 1.0, default 1.0 i.e. no loss);
442 *
443 *	name identifier
 *		An optional name (listed by "ipfw pipe show")
445 *		to identify the distribution;
446 *
447 *	"delay prob" | "prob delay"
448 *		One of these two lines is mandatory and defines
449 *		the format of the following lines with data points.
450 *
451 *	XXX YYY
452 *		2 or more lines representing points in the curve,
453 *		with either delay or probability first, according
454 *		to the chosen format.
455 *		The unit for delay is milliseconds.
456 *
 * Data points do not need to be ordered, nor does their count need
 * to match the number given in the "samples" line. ipfw will sort
 * and interpolate the curve as needed.
460 *
461 * Example of a profile file:
462
463        name    bla_bla_bla
464        samples 100
465        loss-level    0.86
466        prob    delay
467        0       200	# minimum overhead is 200ms
468        0.5     200
469        0.5     300
470        0.8     1000
471        0.9     1300
472        1       1300
473
474 * Internally, we will convert the curve to a fixed number of
475 * samples, and when it is time to transmit a packet we will
476 * model the extra delay as extra bits in the packet.
477 *
478 */
479
/*
 * Constants for the delay-profile file parser: line buffer size,
 * recognised keywords, token separators and the minimum number of
 * data points. ED_MAX_LINE_LEN is parenthesized so that it expands
 * correctly inside larger expressions.
 */
#define ED_MAX_LINE_LEN	(256 + ED_MAX_NAME_LEN)
#define ED_TOK_SAMPLES	"samples"
#define ED_TOK_LOSS	"loss-level"
#define ED_TOK_NAME	"name"
#define ED_TOK_DELAY	"delay"
#define ED_TOK_PROB	"prob"
#define ED_TOK_BW	"bw"
#define ED_SEPARATORS	" \t\n"
#define ED_MIN_SAMPLES_NO	2
489
/*
 * Returns 1 if s consists only of decimal digits with at most one
 * '.', i.e. it looks like a non-negative decimal number; returns 0
 * otherwise. A sign, an exponent or any other character makes the
 * string invalid. The empty string is accepted.
 */
static int
is_valid_number(const char *s)
{
	int dots_found = 0;

	for (; *s != '\0'; s++) {
		if (*s == '.') {
			if (++dots_found > 1)
				return 0;
		/* <ctype.h> functions need an unsigned char value */
		} else if (!isdigit((unsigned char)*s)) {
			return 0;
		}
	}
	return 1;
}
504
/*
 * Take as input a string describing a bandwidth value and set either
 * the clocking interface or the numeric bandwidth.
 *
 * arg:       the text to parse.
 * bandwidth: in/out; a value other than -1 on entry means a bandwidth
 *            was already given and triggers a "duplicate" warning.
 * if_name:   buffer receiving an interface name, or NULL when
 *            interface names are not supported by the caller.
 * namelen:   size of the if_name buffer in bytes.
 *
 * If arg starts with a lowercase letter it is taken as an interface
 * name: it is copied (truncated with a warning if needed) into
 * if_name and *bandwidth is set to 0.
 *
 * Otherwise arg is a number with an optional 'K'/'k' (x1000) or 'M'
 * (x1000000) multiplier, optionally followed by a unit; byte units
 * multiply the value by 8. The result is stored in *bandwidth and
 * if_name, when given, is set to the empty string. The arithmetic is
 * done in a wide unsigned type and range-checked, so a value that
 * does not fit in an int is always rejected instead of wrapping.
 */
static void
read_bandwidth(char *arg, int *bandwidth, char *if_name, int namelen)
{
	if (*bandwidth != -1)
		warnx("duplicate token, override bandwidth value!");

	if (arg[0] >= 'a' && arg[0] <= 'z') {
		/* interface name */
		if (!if_name || namelen <= 0) {
			errx(1, "no if support");
		}
		/* warn only when the name really does not fit */
		if (strlen(arg) >= (size_t)namelen)
			warnx("interface name truncated");
		/* strncpy also zero-fills the unused tail of the buffer */
		strncpy(if_name, arg, namelen - 1);
		if_name[namelen - 1] = '\0';
		*bandwidth = 0;
	} else {	/* read bandwidth value */
		unsigned long long bw;
		char *end = NULL;

		bw = strtoul(arg, &end, 0);
		/*
		 * Reject early: with bw <= INT_MAX the multiplications
		 * below (at most x8000000) cannot overflow 64 bits.
		 */
		if (bw > INT_MAX)
			errx(EX_DATAERR, "bandwidth too large");

		if (*end == 'K' || *end == 'k') {
			end++;
			bw *= 1000;
		} else if (*end == 'M') {
			end++;
			bw *= 1000000;
		}
		/* 'B' not followed by "it/s", or "bytes": value is in bytes */
		if ((*end == 'B' &&
			_substrcmp2(end, "Bi", "Bit/s") != 0) ||
		    _substrcmp2(end, "by", "bytes") == 0)
			bw *= 8;

		if (bw > INT_MAX)
			errx(EX_DATAERR, "bandwidth too large");

		*bandwidth = (int)bw;
		if (if_name)
			if_name[0] = '\0';
	}
}
552
/* One data point of a delay profile curve. */
struct point {
	double prob;
	double delay;
};

/*
 * qsort() comparison callback for struct point: orders by ascending
 * probability and, for equal probabilities, by ascending delay.
 * Returns -1, 0 or 1.
 */
static int
compare_points(const void *vp1, const void *vp2)
{
	const struct point *lhs = vp1;
	const struct point *rhs = vp2;
	double diff = lhs->prob - rhs->prob;

	if (diff == 0)
		diff = lhs->delay - rhs->delay;
	return (diff > 0) - (diff < 0);
}
575
576#define ED_EFMT(s) EX_DATAERR,"error in %s at line %d: "#s,filename,lineno
577
578static void
579load_extra_delays(const char *filename, struct dn_profile *p,
580	struct dn_link *link)
581{
582	char    line[ED_MAX_LINE_LEN];
583	FILE    *f;
584	int     lineno = 0;
585	int     i;
586
587	int     samples = -1;
588	double  loss = -1.0;
589	char    profile_name[ED_MAX_NAME_LEN];
590	int     delay_first = -1;
591	int     do_points = 0;
592	struct point    points[ED_MAX_SAMPLES_NO];
593	int     points_no = 0;
594
595	/* XXX link never NULL? */
596	p->link_nr = link->link_nr;
597
598	profile_name[0] = '\0';
599	f = fopen(filename, "r");
600	if (f == NULL)
601		err(EX_UNAVAILABLE, "fopen: %s", filename);
602
603	while (fgets(line, ED_MAX_LINE_LEN, f)) {         /* read commands */
604		char *s, *cur = line, *name = NULL, *arg = NULL;
605
606		++lineno;
607
608		/* parse the line */
609		while (cur) {
610			s = strsep(&cur, ED_SEPARATORS);
611			if (s == NULL || *s == '#')
612				break;
613			if (*s == '\0')
614				continue;
615			if (arg)
616				errx(ED_EFMT("too many arguments"));
617			if (name == NULL)
618				name = s;
619			else
620				arg = s;
621		}
622		if (name == NULL)	/* empty line */
623			continue;
624		if (arg == NULL)
625			errx(ED_EFMT("missing arg for %s"), name);
626
627		if (!strcasecmp(name, ED_TOK_SAMPLES)) {
628		    if (samples > 0)
629			errx(ED_EFMT("duplicate ``samples'' line"));
630		    if (atoi(arg) <=0)
631			errx(ED_EFMT("invalid number of samples"));
632		    samples = atoi(arg);
633		    if (samples>ED_MAX_SAMPLES_NO)
634			    errx(ED_EFMT("too many samples, maximum is %d"),
635				ED_MAX_SAMPLES_NO);
636		    do_points = 0;
637		} else if (!strcasecmp(name, ED_TOK_BW)) {
638		    char buf[IFNAMSIZ];
639		    read_bandwidth(arg, &link->bandwidth, buf, sizeof(buf));
640		} else if (!strcasecmp(name, ED_TOK_LOSS)) {
641		    if (loss != -1.0)
642			errx(ED_EFMT("duplicated token: %s"), name);
643		    if (!is_valid_number(arg))
644			errx(ED_EFMT("invalid %s"), arg);
645		    loss = atof(arg);
646		    if (loss > 1)
647			errx(ED_EFMT("%s greater than 1.0"), name);
648		    do_points = 0;
649		} else if (!strcasecmp(name, ED_TOK_NAME)) {
650		    if (profile_name[0] != '\0')
651			errx(ED_EFMT("duplicated token: %s"), name);
652		    strncpy(profile_name, arg, sizeof(profile_name) - 1);
653		    profile_name[sizeof(profile_name)-1] = '\0';
654		    do_points = 0;
655		} else if (!strcasecmp(name, ED_TOK_DELAY)) {
656		    if (do_points)
657			errx(ED_EFMT("duplicated token: %s"), name);
658		    delay_first = 1;
659		    do_points = 1;
660		} else if (!strcasecmp(name, ED_TOK_PROB)) {
661		    if (do_points)
662			errx(ED_EFMT("duplicated token: %s"), name);
663		    delay_first = 0;
664		    do_points = 1;
665		} else if (do_points) {
666		    if (!is_valid_number(name) || !is_valid_number(arg))
667			errx(ED_EFMT("invalid point found"));
668		    if (delay_first) {
669			points[points_no].delay = atof(name);
670			points[points_no].prob = atof(arg);
671		    } else {
672			points[points_no].delay = atof(arg);
673			points[points_no].prob = atof(name);
674		    }
675		    if (points[points_no].prob > 1.0)
676			errx(ED_EFMT("probability greater than 1.0"));
677		    ++points_no;
678		} else {
679		    errx(ED_EFMT("unrecognised command '%s'"), name);
680		}
681	}
682
683	fclose (f);
684
685	if (samples == -1) {
686	    warnx("'%s' not found, assuming 100", ED_TOK_SAMPLES);
687	    samples = 100;
688	}
689
690	if (loss == -1.0) {
691	    warnx("'%s' not found, assuming no loss", ED_TOK_LOSS);
692	    loss = 1;
693	}
694
695	/* make sure that there are enough points. */
696	if (points_no < ED_MIN_SAMPLES_NO)
697	    errx(ED_EFMT("too few samples, need at least %d"),
698		ED_MIN_SAMPLES_NO);
699
700	qsort(points, points_no, sizeof(struct point), compare_points);
701
702	/* interpolation */
703	for (i = 0; i<points_no-1; ++i) {
704	    double y1 = points[i].prob * samples;
705	    double x1 = points[i].delay;
706	    double y2 = points[i+1].prob * samples;
707	    double x2 = points[i+1].delay;
708
709	    int ix = y1;
710	    int stop = y2;
711
712	    if (x1 == x2) {
713		for (; ix<stop; ++ix)
714		    p->samples[ix] = x1;
715	    } else {
716		double m = (y2-y1)/(x2-x1);
717		double c = y1 - m*x1;
718		for (; ix<stop ; ++ix)
719		    p->samples[ix] = (ix - c)/m;
720	    }
721	}
722	p->samples_no = samples;
723	p->loss_level = loss * samples;
724	strncpy(p->name, profile_name, sizeof(p->name));
725}
726
727/*
728 * configuration of pipes, schedulers, flowsets.
729 * When we configure a new scheduler, an empty pipe is created, so:
730 *
731 * do_pipe = 1 -> "pipe N config ..." only for backward compatibility
732 *	sched N+Delta type fifo sched_mask ...
733 *	pipe N+Delta <parameters>
734 *	flowset N+Delta pipe N+Delta (no parameters)
735 *	sched N type wf2q+ sched_mask ...
736 *	pipe N <parameters>
737 *
738 * do_pipe = 2 -> flowset N config
739 *	flowset N parameters
740 *
741 * do_pipe = 3 -> sched N config
742 *	sched N parameters (default no pipe)
743 *	optional Pipe N config ...
744 * pipe ==>
745 */
746void
747ipfw_config_pipe(int ac, char **av)
748{
749	int i, j;
750	char *end;
751	void *par = NULL;
752	struct dn_id *buf, *base;
753	struct dn_sch *sch = NULL;
754	struct dn_link *p = NULL;
755	struct dn_fs *fs = NULL;
756	struct dn_profile *pf = NULL;
757	struct ipfw_flow_id *mask = NULL;
758	int lmax;
759	uint32_t _foo = 0, *flags = &_foo , *buckets = &_foo;
760
761	/*
762	 * allocate space for 1 header,
763	 * 1 scheduler, 1 link, 1 flowset, 1 profile
764	 */
765	lmax = sizeof(struct dn_id);	/* command header */
766	lmax += sizeof(struct dn_sch) + sizeof(struct dn_link) +
767		sizeof(struct dn_fs) + sizeof(struct dn_profile);
768
769	av++; ac--;
770	/* Pipe number */
771	if (ac && isdigit(**av)) {
772		i = atoi(*av); av++; ac--;
773	} else
774		i = -1;
775	if (i <= 0)
776		errx(EX_USAGE, "need a pipe/flowset/sched number");
777	base = buf = safe_calloc(1, lmax);
778	/* all commands start with a 'CONFIGURE' and a version */
779	o_next(&buf, sizeof(struct dn_id), DN_CMD_CONFIG);
780	base->id = DN_API_VERSION;
781
782	switch (co.do_pipe) {
783	case 1: /* "pipe N config ..." */
784		/* Allocate space for the WF2Q+ scheduler, its link
785		 * and the FIFO flowset. Set the number, but leave
786		 * the scheduler subtype and other parameters to 0
787		 * so the kernel will use appropriate defaults.
788		 * XXX todo: add a flag to record if a parameter
789		 * is actually configured.
790		 * If we do a 'pipe config' mask -> sched_mask.
791		 * The FIFO scheduler and link are derived from the
792		 * WF2Q+ one in the kernel.
793		 */
794		sch = o_next(&buf, sizeof(*sch), DN_SCH);
795		p = o_next(&buf, sizeof(*p), DN_LINK);
796		fs = o_next(&buf, sizeof(*fs), DN_FS);
797
798		sch->sched_nr = i;
799		sch->oid.subtype = 0;	/* defaults to WF2Q+ */
800		mask = &sch->sched_mask;
801		flags = &sch->flags;
802		buckets = &sch->buckets;
803		*flags |= DN_PIPE_CMD;
804
805		p->link_nr = i;
806
807		/* This flowset is only for the FIFO scheduler */
808		fs->fs_nr = i + 2*DN_MAX_ID;
809		fs->sched_nr = i + DN_MAX_ID;
810		break;
811
812	case 2: /* "queue N config ... " */
813		fs = o_next(&buf, sizeof(*fs), DN_FS);
814		fs->fs_nr = i;
815		mask = &fs->flow_mask;
816		flags = &fs->flags;
817		buckets = &fs->buckets;
818		break;
819
820	case 3: /* "sched N config ..." */
821		sch = o_next(&buf, sizeof(*sch), DN_SCH);
822		fs = o_next(&buf, sizeof(*fs), DN_FS);
823		sch->sched_nr = i;
824		mask = &sch->sched_mask;
825		flags = &sch->flags;
826		buckets = &sch->buckets;
827		/* fs is used only with !MULTIQUEUE schedulers */
828		fs->fs_nr = i + DN_MAX_ID;
829		fs->sched_nr = i;
830		break;
831	}
832	/* set to -1 those fields for which we want to reuse existing
833	 * values from the kernel.
834	 * Also, *_nr and subtype = 0 mean reuse the value from the kernel.
835	 * XXX todo: support reuse of the mask.
836	 */
837	if (p)
838		p->bandwidth = -1;
839	for (j = 0; j < sizeof(fs->par)/sizeof(fs->par[0]); j++)
840		fs->par[j] = -1;
841	while (ac > 0) {
842		double d;
843		int tok = match_token(dummynet_params, *av);
844		ac--; av++;
845
846		switch(tok) {
847		case TOK_NOERROR:
848			NEED(fs, "noerror is only for pipes");
849			fs->flags |= DN_NOERROR;
850			break;
851
852		case TOK_PLR:
853			NEED(fs, "plr is only for pipes");
854			NEED1("plr needs argument 0..1\n");
855			d = strtod(av[0], NULL);
856			if (d > 1)
857				d = 1;
858			else if (d < 0)
859				d = 0;
860			fs->plr = (int)(d*0x7fffffff);
861			ac--; av++;
862			break;
863
864		case TOK_QUEUE:
865			NEED(fs, "queue is only for pipes or flowsets");
866			NEED1("queue needs queue size\n");
867			end = NULL;
868			fs->qsize = strtoul(av[0], &end, 0);
869			if (*end == 'K' || *end == 'k') {
870				fs->flags |= DN_QSIZE_BYTES;
871				fs->qsize *= 1024;
872			} else if (*end == 'B' ||
873			    _substrcmp2(end, "by", "bytes") == 0) {
874				fs->flags |= DN_QSIZE_BYTES;
875			}
876			ac--; av++;
877			break;
878
879		case TOK_BUCKETS:
880			NEED(fs, "buckets is only for pipes or flowsets");
881			NEED1("buckets needs argument\n");
882			*buckets = strtoul(av[0], NULL, 0);
883			ac--; av++;
884			break;
885
886		case TOK_FLOW_MASK:
887		case TOK_SCHED_MASK:
888		case TOK_MASK:
889			NEED(mask, "tok_mask");
890			NEED1("mask needs mask specifier\n");
891			/*
892			 * per-flow queue, mask is dst_ip, dst_port,
893			 * src_ip, src_port, proto measured in bits
894			 */
895			par = NULL;
896
897			bzero(mask, sizeof(*mask));
898			end = NULL;
899
900			while (ac >= 1) {
901			    uint32_t *p32 = NULL;
902			    uint16_t *p16 = NULL;
903			    uint32_t *p20 = NULL;
904			    struct in6_addr *pa6 = NULL;
905			    uint32_t a;
906
907			    tok = match_token(dummynet_params, *av);
908			    ac--; av++;
909			    switch(tok) {
910			    case TOK_ALL:
911				    /*
912				     * special case, all bits significant
913				     */
914				    mask->dst_ip = ~0;
915				    mask->src_ip = ~0;
916				    mask->dst_port = ~0;
917				    mask->src_port = ~0;
918				    mask->proto = ~0;
919				    n2mask(&mask->dst_ip6, 128);
920				    n2mask(&mask->src_ip6, 128);
921				    mask->flow_id6 = ~0;
922				    *flags |= DN_HAVE_MASK;
923				    goto end_mask;
924
925			    case TOK_DSTIP:
926				    mask->addr_type = 4;
927				    p32 = &mask->dst_ip;
928				    break;
929
930			    case TOK_SRCIP:
931				    mask->addr_type = 4;
932				    p32 = &mask->src_ip;
933				    break;
934
935			    case TOK_DSTIP6:
936				    mask->addr_type = 6;
937				    pa6 = &mask->dst_ip6;
938				    break;
939
940			    case TOK_SRCIP6:
941				    mask->addr_type = 6;
942				    pa6 = &mask->src_ip6;
943				    break;
944
945			    case TOK_FLOWID:
946				    mask->addr_type = 6;
947				    p20 = &mask->flow_id6;
948				    break;
949
950			    case TOK_DSTPORT:
951				    p16 = &mask->dst_port;
952				    break;
953
954			    case TOK_SRCPORT:
955				    p16 = &mask->src_port;
956				    break;
957
958			    case TOK_PROTO:
959				    break;
960
961			    default:
962				    ac++; av--; /* backtrack */
963				    goto end_mask;
964			    }
965			    if (ac < 1)
966				    errx(EX_USAGE, "mask: value missing");
967			    if (*av[0] == '/') {
968				    a = strtoul(av[0]+1, &end, 0);
969				    if (pa6 == NULL)
970					    a = (a == 32) ? ~0 : (1 << a) - 1;
971			    } else
972				    a = strtoul(av[0], &end, 0);
973			    if (p32 != NULL)
974				    *p32 = a;
975			    else if (p16 != NULL) {
976				    if (a > 0xFFFF)
977					    errx(EX_DATAERR,
978						"port mask must be 16 bit");
979				    *p16 = (uint16_t)a;
980			    } else if (p20 != NULL) {
981				    if (a > 0xfffff)
982					errx(EX_DATAERR,
983					    "flow_id mask must be 20 bit");
984				    *p20 = (uint32_t)a;
985			    } else if (pa6 != NULL) {
986				    if (a > 128)
987					errx(EX_DATAERR,
988					    "in6addr invalid mask len");
989				    else
990					n2mask(pa6, a);
991			    } else {
992				    if (a > 0xFF)
993					    errx(EX_DATAERR,
994						"proto mask must be 8 bit");
995				    fs->flow_mask.proto = (uint8_t)a;
996			    }
997			    if (a != 0)
998				    *flags |= DN_HAVE_MASK;
999			    ac--; av++;
1000			} /* end while, config masks */
1001end_mask:
1002			break;
1003
1004		case TOK_RED:
1005		case TOK_GRED:
1006			NEED1("red/gred needs w_q/min_th/max_th/max_p\n");
1007			fs->flags |= DN_IS_RED;
1008			if (tok == TOK_GRED)
1009				fs->flags |= DN_IS_GENTLE_RED;
1010			/*
1011			 * the format for parameters is w_q/min_th/max_th/max_p
1012			 */
1013			if ((end = strsep(&av[0], "/"))) {
1014			    double w_q = strtod(end, NULL);
1015			    if (w_q > 1 || w_q <= 0)
1016				errx(EX_DATAERR, "0 < w_q <= 1");
1017			    fs->w_q = (int) (w_q * (1 << SCALE_RED));
1018			}
1019			if ((end = strsep(&av[0], "/"))) {
1020			    fs->min_th = strtoul(end, &end, 0);
1021			    if (*end == 'K' || *end == 'k')
1022				fs->min_th *= 1024;
1023			}
1024			if ((end = strsep(&av[0], "/"))) {
1025			    fs->max_th = strtoul(end, &end, 0);
1026			    if (*end == 'K' || *end == 'k')
1027				fs->max_th *= 1024;
1028			}
1029			if ((end = strsep(&av[0], "/"))) {
1030			    double max_p = strtod(end, NULL);
1031			    if (max_p > 1 || max_p <= 0)
1032				errx(EX_DATAERR, "0 < max_p <= 1");
1033			    fs->max_p = (int)(max_p * (1 << SCALE_RED));
1034			}
1035			ac--; av++;
1036			break;
1037
1038		case TOK_DROPTAIL:
1039			NEED(fs, "droptail is only for flowsets");
1040			fs->flags &= ~(DN_IS_RED|DN_IS_GENTLE_RED);
1041			break;
1042
1043		case TOK_BW:
1044			NEED(p, "bw is only for links");
1045			NEED1("bw needs bandwidth or interface\n");
1046			read_bandwidth(av[0], &p->bandwidth, NULL, 0);
1047			ac--; av++;
1048			break;
1049
1050		case TOK_DELAY:
1051			NEED(p, "delay is only for links");
1052			NEED1("delay needs argument 0..10000ms\n");
1053			p->delay = strtoul(av[0], NULL, 0);
1054			ac--; av++;
1055			break;
1056
1057		case TOK_TYPE: {
1058			int l;
1059			NEED(sch, "type is only for schedulers");
1060			NEED1("type needs a string");
1061			l = strlen(av[0]);
1062			if (l == 0 || l > 15)
1063				errx(1, "type %s too long\n", av[0]);
1064			strcpy(sch->name, av[0]);
1065			sch->oid.subtype = 0; /* use string */
1066			ac--; av++;
1067			break;
1068		    }
1069
1070		case TOK_WEIGHT:
1071			NEED(fs, "weight is only for flowsets");
1072			NEED1("weight needs argument\n");
1073			fs->par[0] = strtol(av[0], &end, 0);
1074			ac--; av++;
1075			break;
1076
1077		case TOK_LMAX:
1078			NEED(fs, "lmax is only for flowsets");
1079			NEED1("lmax needs argument\n");
1080			fs->par[1] = strtol(av[0], &end, 0);
1081			ac--; av++;
1082			break;
1083
1084		case TOK_PRI:
1085			NEED(fs, "priority is only for flowsets");
1086			NEED1("priority needs argument\n");
1087			fs->par[2] = strtol(av[0], &end, 0);
1088			ac--; av++;
1089			break;
1090
1091		case TOK_SCHED:
1092		case TOK_PIPE:
1093			NEED(fs, "pipe/sched");
1094			NEED1("pipe/link/sched needs number\n");
1095			fs->sched_nr = strtoul(av[0], &end, 0);
1096			ac--; av++;
1097			break;
1098
1099		case TOK_PROFILE:
1100			NEED((!pf), "profile already set");
1101			NEED(p, "profile");
1102		    {
1103			NEED1("extra delay needs the file name\n");
1104			pf = o_next(&buf, sizeof(*pf), DN_PROFILE);
1105			load_extra_delays(av[0], pf, p); //XXX can't fail?
1106			--ac; ++av;
1107		    }
1108			break;
1109
1110		case TOK_BURST:
1111			NEED(p, "burst");
1112			NEED1("burst needs argument\n");
1113			errno = 0;
1114			if (expand_number(av[0], (int64_t *)&p->burst) < 0)
1115				if (errno != ERANGE)
1116					errx(EX_DATAERR,
1117					    "burst: invalid argument");
1118			if (errno || p->burst > (1ULL << 48) - 1)
1119				errx(EX_DATAERR,
1120				    "burst: out of range (0..2^48-1)");
1121			ac--; av++;
1122			break;
1123
1124		default:
1125			errx(EX_DATAERR, "unrecognised option ``%s''", av[-1]);
1126		}
1127	}
1128
1129	/* check validity of parameters */
1130	if (p) {
1131		if (p->delay > 10000)
1132			errx(EX_DATAERR, "delay must be < 10000");
1133		if (p->bandwidth == -1)
1134			p->bandwidth = 0;
1135	}
1136	if (fs) {
1137		/* XXX accept a 0 scheduler to keep the default */
1138	    if (fs->flags & DN_QSIZE_BYTES) {
1139		size_t len;
1140		long limit;
1141
1142		len = sizeof(limit);
1143		if (sysctlbyname("net.inet.ip.dummynet.pipe_byte_limit",
1144			&limit, &len, NULL, 0) == -1)
1145			limit = 1024*1024;
1146		if (fs->qsize > limit)
1147			errx(EX_DATAERR, "queue size must be < %ldB", limit);
1148	    } else {
1149		size_t len;
1150		long limit;
1151
1152		len = sizeof(limit);
1153		if (sysctlbyname("net.inet.ip.dummynet.pipe_slot_limit",
1154			&limit, &len, NULL, 0) == -1)
1155			limit = 100;
1156		if (fs->qsize > limit)
1157			errx(EX_DATAERR, "2 <= queue size <= %ld", limit);
1158	    }
1159
1160	    if (fs->flags & DN_IS_RED) {
1161		size_t len;
1162		int lookup_depth, avg_pkt_size;
1163		double w_q;
1164
1165		if (fs->min_th >= fs->max_th)
1166		    errx(EX_DATAERR, "min_th %d must be < than max_th %d",
1167			fs->min_th, fs->max_th);
1168		if (fs->max_th == 0)
1169		    errx(EX_DATAERR, "max_th must be > 0");
1170
1171		len = sizeof(int);
1172		if (sysctlbyname("net.inet.ip.dummynet.red_lookup_depth",
1173			&lookup_depth, &len, NULL, 0) == -1)
1174			lookup_depth = 256;
1175		if (lookup_depth == 0)
1176		    errx(EX_DATAERR, "net.inet.ip.dummynet.red_lookup_depth"
1177			" must be greater than zero");
1178
1179		len = sizeof(int);
1180		if (sysctlbyname("net.inet.ip.dummynet.red_avg_pkt_size",
1181			&avg_pkt_size, &len, NULL, 0) == -1)
1182			avg_pkt_size = 512;
1183
1184		if (avg_pkt_size == 0)
1185			errx(EX_DATAERR,
1186			    "net.inet.ip.dummynet.red_avg_pkt_size must"
1187			    " be greater than zero");
1188
1189		/*
1190		 * Ticks needed for sending a medium-sized packet.
1191		 * Unfortunately, when we are configuring a WF2Q+ queue, we
1192		 * do not have bandwidth information, because that is stored
1193		 * in the parent pipe, and also we have multiple queues
1194		 * competing for it. So we set s=0, which is not very
1195		 * correct. But on the other hand, why do we want RED with
1196		 * WF2Q+ ?
1197		 */
1198#if 0
1199		if (p.bandwidth==0) /* this is a WF2Q+ queue */
1200			s = 0;
1201		else
1202			s = (double)ck.hz * avg_pkt_size * 8 / p.bandwidth;
1203#endif
1204		/*
1205		 * max idle time (in ticks) before avg queue size becomes 0.
1206		 * NOTA:  (3/w_q) is approx the value x so that
1207		 * (1-w_q)^x < 10^-3.
1208		 */
1209		w_q = ((double)fs->w_q) / (1 << SCALE_RED);
1210#if 0 // go in kernel
1211		idle = s * 3. / w_q;
1212		fs->lookup_step = (int)idle / lookup_depth;
1213		if (!fs->lookup_step)
1214			fs->lookup_step = 1;
1215		weight = 1 - w_q;
1216		for (t = fs->lookup_step; t > 1; --t)
1217			weight *= 1 - w_q;
1218		fs->lookup_weight = (int)(weight * (1 << SCALE_RED));
1219#endif
1220	    }
1221	}
1222
1223	i = do_cmd(IP_DUMMYNET3, base, (char *)buf - (char *)base);
1224
1225	if (i)
1226		err(1, "setsockopt(%s)", "IP_DUMMYNET_CONFIGURE");
1227}
1228
1229void
1230dummynet_flush(void)
1231{
1232	struct dn_id oid;
1233	oid_fill(&oid, sizeof(oid), DN_CMD_FLUSH, DN_API_VERSION);
1234	do_cmd(IP_DUMMYNET3, &oid, oid.len);
1235}
1236
1237/* Parse input for 'ipfw [pipe|sched|queue] show [range list]'
1238 * Returns the number of ranges, and possibly stores them
1239 * in the array v of size len.
1240 */
1241static int
1242parse_range(int ac, char *av[], uint32_t *v, int len)
1243{
1244	int n = 0;
1245	char *endptr, *s;
1246	uint32_t base[2];
1247
1248	if (v == NULL || len < 2) {
1249		v = base;
1250		len = 2;
1251	}
1252
1253	for (s = *av; s != NULL; av++, ac--) {
1254		v[0] = strtoul(s, &endptr, 10);
1255		v[1] = (*endptr != '-') ? v[0] :
1256			 strtoul(endptr+1, &endptr, 10);
1257		if (*endptr == '\0') { /* prepare for next round */
1258			s = (ac > 0) ? *(av+1) : NULL;
1259		} else {
1260			if (*endptr != ',') {
1261				warn("invalid number: %s", s);
1262				s = ++endptr;
1263				continue;
1264			}
1265			/* continue processing from here */
1266			s = ++endptr;
1267			ac++;
1268			av--;
1269		}
1270		if (v[1] < v[0] ||
1271			v[1] < 0 || v[1] >= DN_MAX_ID-1 ||
1272			v[0] < 0 || v[1] >= DN_MAX_ID-1) {
1273			continue; /* invalid entry */
1274		}
1275		n++;
1276		/* translate if 'pipe list' */
1277		if (co.do_pipe == 1) {
1278			v[0] += DN_MAX_ID;
1279			v[1] += DN_MAX_ID;
1280		}
1281		v = (n*2 < len) ? v + 2 : base;
1282	}
1283	return n;
1284}
1285
1286/* main entry point for dummynet list functions. co.do_pipe indicates
1287 * which function we want to support.
1288 * av may contain filtering arguments, either individual entries
1289 * or ranges, or lists (space or commas are valid separators).
1290 * Format for a range can be n1-n2 or n3 n4 n5 ...
1291 * In a range n1 must be <= n2, otherwise the range is ignored.
1292 * A number 'n4' is translate in a range 'n4-n4'
1293 * All number must be > 0 and < DN_MAX_ID-1
1294 */
1295void
1296dummynet_list(int ac, char *av[], int show_counters)
1297{
1298	struct dn_id *oid, *x = NULL;
1299	int ret, i, l;
1300	int n; 		/* # of ranges */
1301	int buflen;
1302	int max_size;	/* largest obj passed up */
1303
1304	ac--;
1305	av++; 		/* skip 'list' | 'show' word */
1306
1307	n = parse_range(ac, av, NULL, 0);	/* Count # of ranges. */
1308
1309	/* Allocate space to store ranges */
1310	l = sizeof(*oid) + sizeof(uint32_t) * n * 2;
1311	oid = safe_calloc(1, l);
1312	oid_fill(oid, l, DN_CMD_GET, DN_API_VERSION);
1313
1314	if (n > 0)	/* store ranges in idx */
1315		parse_range(ac, av, (uint32_t *)(oid + 1), n*2);
1316	/*
1317	 * Compute the size of the largest object returned. If the
1318	 * response leaves at least this much spare space in the
1319	 * buffer, then surely the response is complete; otherwise
1320	 * there might be a risk of truncation and we will need to
1321	 * retry with a larger buffer.
1322	 * XXX don't bother with smaller structs.
1323	 */
1324	max_size = sizeof(struct dn_fs);
1325	if (max_size < sizeof(struct dn_sch))
1326		max_size = sizeof(struct dn_sch);
1327	if (max_size < sizeof(struct dn_flow))
1328		max_size = sizeof(struct dn_flow);
1329
1330	switch (co.do_pipe) {
1331	case 1:
1332		oid->subtype = DN_LINK;	/* list pipe */
1333		break;
1334	case 2:
1335		oid->subtype = DN_FS;	/* list queue */
1336		break;
1337	case 3:
1338		oid->subtype = DN_SCH;	/* list sched */
1339		break;
1340	}
1341
1342	/*
1343	 * Ask the kernel an estimate of the required space (result
1344	 * in oid.id), unless we are requesting a subset of objects,
1345	 * in which case the kernel does not give an exact answer.
1346	 * In any case, space might grow in the meantime due to the
1347	 * creation of new queues, so we must be prepared to retry.
1348	 */
1349	if (n > 0) {
1350		buflen = 4*1024;
1351	} else {
1352		ret = do_cmd(-IP_DUMMYNET3, oid, (uintptr_t)&l);
1353		if (ret != 0 || oid->id <= sizeof(*oid))
1354			goto done;
1355		buflen = oid->id + max_size;
1356		oid->len = sizeof(*oid); /* restore */
1357	}
1358	/* Try a few times, until the buffer fits */
1359	for (i = 0; i < 20; i++) {
1360		l = buflen;
1361		x = safe_realloc(x, l);
1362		bcopy(oid, x, oid->len);
1363		ret = do_cmd(-IP_DUMMYNET3, x, (uintptr_t)&l);
1364		if (ret != 0 || x->id <= sizeof(*oid))
1365			goto done; /* no response */
1366		if (l + max_size <= buflen)
1367			break; /* ok */
1368		buflen *= 2;	 /* double for next attempt */
1369	}
1370	list_pipes(x, O_NEXT(x, l));
1371done:
1372	if (x)
1373		free(x);
1374	free(oid);
1375}
1376