ip_dummynet.h revision 71137
144603Sdcs/*
244603Sdcs * Copyright (c) 1998-2000 Luigi Rizzo, Universita` di Pisa
344603Sdcs * Portions Copyright (c) 2000 Akamba Corp.
444603Sdcs * All rights reserved
544603Sdcs *
644603Sdcs * Redistribution and use in source and binary forms, with or without
744603Sdcs * modification, are permitted provided that the following conditions
844603Sdcs * are met:
944603Sdcs * 1. Redistributions of source code must retain the above copyright
1044603Sdcs *    notice, this list of conditions and the following disclaimer.
1144603Sdcs * 2. Redistributions in binary form must reproduce the above copyright
1244603Sdcs *    notice, this list of conditions and the following disclaimer in the
1344603Sdcs *    documentation and/or other materials provided with the distribution.
1444603Sdcs *
1544603Sdcs * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
1644603Sdcs * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1744603Sdcs * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1844603Sdcs * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
1944603Sdcs * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2044603Sdcs * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2144603Sdcs * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2244603Sdcs * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2344603Sdcs * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2444603Sdcs * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2550477Speter * SUCH DAMAGE.
2644603Sdcs *
2761694Sdcs * $FreeBSD: head/sys/netinet/ip_dummynet.h 71137 2001-01-17 01:09:40Z luigi $
2861694Sdcs */
2961694Sdcs
3061694Sdcs#ifndef _IP_DUMMYNET_H
3161694Sdcs#define _IP_DUMMYNET_H
3261694Sdcs
3361694Sdcs/*
3461694Sdcs * Definition of dummynet data structures. In the structures, I decided
3561694Sdcs * not to use the macros in <sys/queue.h> in the hope of making the code
3661694Sdcs * easier to port to other architectures. The type of lists and queue we
3761694Sdcs * use here is pretty simple anyways.
3861379Sdcs */
3961694Sdcs
4061694Sdcs/*
4161694Sdcs * We start with a heap, which is used in the scheduler to decide when
4261694Sdcs * to transmit packets etc.
4361694Sdcs *
4461694Sdcs * The key for the heap is used for two different values:
4561694Sdcs *
4661694Sdcs * 1. timer ticks- max 10K/second, so 32 bits are enough;
4761694Sdcs *
4861694Sdcs * 2. virtual times. These increase in steps of len/x, where len is the
4961694Sdcs *    packet length, and x is either the weight of the flow, or the
5061379Sdcs *    sum of all weights.
5144603Sdcs *    If we limit to max 1000 flows and a max weight of 100, then
5244603Sdcs *    x needs 17 bits. The packet size is 16 bits, so we can easily
5344603Sdcs *    overflow if we do not allow errors.
5444603Sdcs * So we use a key "dn_key" which is 64 bits. Some macros are used to
5544603Sdcs * compare key values and handle wraparounds.
5644603Sdcs * MAX64 returns the largest of two key values.
5744603Sdcs * MY_M is used as a shift count when doing fixed point arithmetic
5844603Sdcs * (a better name would be useful...).
5961376Sdcs */
6061376Sdcstypedef u_int64_t dn_key ;      /* sorting key */
6161376Sdcs#define DN_KEY_LT(a,b)     ((int64_t)((a)-(b)) < 0)
6261376Sdcs#define DN_KEY_LEQ(a,b)    ((int64_t)((a)-(b)) <= 0)
6365621Sdcs#define DN_KEY_GT(a,b)     ((int64_t)((a)-(b)) > 0)
6461376Sdcs#define DN_KEY_GEQ(a,b)    ((int64_t)((a)-(b)) >= 0)
6561376Sdcs#define MAX64(x,y)  (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x)
6661376Sdcs#define MY_M	16 /* number of left shift to obtain a larger precision */
6761376Sdcs
6861376Sdcs/*
6961376Sdcs * XXX With this scaling, max 1000 flows, max weight 100, 1Gbit/s, the
7044603Sdcs * virtual time wraps every 15 days.
7165621Sdcs */
7261376Sdcs
7361376Sdcs/*
7461376Sdcs * The OFFSET_OF macro is used to return the offset of a field within
7565621Sdcs * a structure. It is used by the heap management routines.
7661376Sdcs */
7761376Sdcs#define OFFSET_OF(type, field) ((int)&( ((type *)0)->field) )
7861376Sdcs
7961376Sdcs/*
8061376Sdcs * A heap entry is made of a key and a pointer to the actual
8161376Sdcs * object stored in the heap.
8261376Sdcs * The heap is an array of dn_heap_entry entries, dynamically allocated.
8361376Sdcs * Current size is "size", with "elements" actually in use.
8461376Sdcs * The heap normally supports only ordered insert and extract from the top.
8561376Sdcs * If we want to extract an object from the middle of the heap, we
8644603Sdcs * have to know where the object itself is located in the heap (or we
8765621Sdcs * need to scan the whole array). To this purpose, an object has a
8865621Sdcs * field (int) which contains the index of the object itself into the
8965621Sdcs * heap. When the object is moved, the field must also be updated.
9065621Sdcs * The offset of the index in the object is stored in the 'offset'
9165621Sdcs * field in the heap descriptor. The assumption is that this offset
9265621Sdcs * is non-zero if we want to support extract from the middle.
9365621Sdcs */
9465621Sdcsstruct dn_heap_entry {
9565621Sdcs    dn_key key ;	/* sorting key. Topmost element is smallest one */
9665621Sdcs    void *object ;	/* object pointer */
9761376Sdcs} ;
9865621Sdcs
9965621Sdcsstruct dn_heap {
10065621Sdcs    int size ;
10165621Sdcs    int elements ;
10265621Sdcs    int offset ; /* XXX if > 0 this is the offset of direct ptr to obj */
10365621Sdcs    struct dn_heap_entry *p ;	/* really an array of "size" entries */
10465621Sdcs} ;
10565621Sdcs
10665621Sdcs/*
10765621Sdcs * MT_DUMMYNET is a new (fake) mbuf type that is prepended to the
10861376Sdcs * packet when it comes out of a pipe. The definition
10965621Sdcs * ought to go in /sys/sys/mbuf.h but here it is less intrusive.
11065621Sdcs */
11165621Sdcs
11265621Sdcs#define MT_DUMMYNET MT_CONTROL
11365621Sdcs
11465621Sdcs/*
11561376Sdcs * struct dn_pkt identifies a packet in the dummynet queue. The
11665621Sdcs * first part is really an m_hdr for implementation purposes, and some
11765621Sdcs * fields are saved there. When passing the packet back to the ip_input/
11865621Sdcs * ip_output()/bdg_forward, the struct is prepended to the mbuf chain with type
11965621Sdcs * MT_DUMMYNET, and contains the pointer to the matching rule.
12061376Sdcs *
12165621Sdcs * Note: there is no real need to make this structure contain an m_hdr,
12265621Sdcs * in the future this should be changed to a normal data structure.
12365621Sdcs */
12465621Sdcsstruct dn_pkt {
12565621Sdcs	struct m_hdr hdr ;
12665621Sdcs#define dn_next	hdr.mh_nextpkt	/* next element in queue */
12765621Sdcs#define DN_NEXT(x)	(struct dn_pkt *)(x)->dn_next
12865621Sdcs#define dn_m	hdr.mh_next	/* packet to be forwarded */
12965621Sdcs#define dn_dir	hdr.mh_flags	/* action when pkt extracted from a queue */
13065621Sdcs#define DN_TO_IP_OUT	1
13165621Sdcs#define DN_TO_IP_IN	2
13265621Sdcs#define DN_TO_BDG_FWD	3
13365621Sdcs
13465621Sdcs	dn_key  output_time;    /* when the pkt is due for delivery */
13565621Sdcs        struct ifnet *ifp;	/* interface, for ip_output		*/
13661376Sdcs	struct sockaddr_in *dn_dst ;
13765621Sdcs        struct route ro;	/* route, for ip_output. MUST COPY	*/
13865621Sdcs	int flags ;		/* flags, for ip_output (IPv6 ?) */
13965621Sdcs};
14065621Sdcs
14161376Sdcs/*
14265621Sdcs * Overall structure of dummynet (with WF2Q+):
14365621Sdcs
14465621SdcsIn dummynet, packets are selected with the firewall rules, and passed
14565621Sdcsto two different objects: PIPE or QUEUE.
14665621Sdcs
14765621SdcsA QUEUE is just a queue with configurable size and queue management
14865621Sdcspolicy. It is also associated with a mask (to discriminate among
14965621Sdcsdifferent flows), a weight (used to give different shares of the
15065621Sdcsbandwidth to different flows) and a "pipe", which essentially
15165621Sdcssupplies the transmit clock for all queues associated with that
15265621Sdcspipe.
15365621Sdcs
15465621SdcsA PIPE emulates a fixed-bandwidth link, whose bandwidth is
15565621Sdcsconfigurable.  The "clock" for a pipe can come from either an
15665621Sdcsinternal timer, or from the transmit interrupt of an interface.
15765621SdcsA pipe is also associated with one (or more, if masks are used)
15865621Sdcsqueue, where all packets for that pipe are stored.
15965621Sdcs
16065621SdcsThe bandwidth available on the pipe is shared by the queues
16165621Sdcsassociated with that pipe (only one in case the packet is sent
16265621Sdcsto a PIPE) according to the WF2Q+ scheduling algorithm and the
16365621Sdcsconfigured weights.
16465621Sdcs
16565621SdcsIn general, incoming packets are stored in the appropriate queue,
16665621Sdcswhich is then placed into one of a few heaps managed by a scheduler
16765621Sdcsto decide when the packet should be extracted.
16865621SdcsThe scheduler (a function called dummynet()) is run at every timer
16965621Sdcstick, and grabs queues from the head of the heaps when they are
17065621Sdcsready for processing.
17165621Sdcs
17265621SdcsThere are three data structures definining a pipe and associated queues:
17365621Sdcs
17465621Sdcs + dn_pipe, which contains the main configuration parameters related
17565621Sdcs   to delay and bandwidth;
17665621Sdcs + dn_flow_set, which contains WF2Q+ configuration, flow
17765621Sdcs   masks, plr and RED configuration;
17865621Sdcs + dn_flow_queue, which is the per-flow queue (containing the packets)
17965621Sdcs
18065621SdcsMultiple dn_flow_set can be linked to the same pipe, and multiple
18165621Sdcsdn_flow_queue can be linked to the same dn_flow_set.
18265621SdcsAll data structures are linked in a linear list which is used for
18365621Sdcshousekeeping purposes.
18465621Sdcs
18565621SdcsDuring configuration, we create and initialize the dn_flow_set
18665621Sdcsand dn_pipe structures (a dn_pipe also contains a dn_flow_set).
18761376Sdcs
18861376SdcsAt runtime: packets are sent to the appropriate dn_flow_set (either
18961376SdcsWFQ ones, or the one embedded in the dn_pipe for fixed-rate flows),
19061376Sdcswhich in turn dispatches them to the appropriate dn_flow_queue
19161376Sdcs(created dynamically according to the masks).
19261376Sdcs
19361376SdcsThe transmit clock for fixed rate flows (ready_event()) selects the
19461376Sdcsdn_flow_queue to be used to transmit the next packet. For WF2Q,
19561376Sdcswfq_ready_event() extract a pipe which in turn selects the right
19661376Sdcsflow using a number of heaps defined into the pipe itself.
19765621Sdcs
19861376Sdcs *
19961376Sdcs */
20061376Sdcs
20161376Sdcs/*
20261376Sdcs * per flow queue. This contains the flow identifier, the queue
20361376Sdcs * of packets, counters, and parameters used to support both RED and
20461376Sdcs * WF2Q+.
20561376Sdcs */
20661376Sdcsstruct dn_flow_queue {
20761376Sdcs    struct dn_flow_queue *next ;
20861376Sdcs    struct ipfw_flow_id id ;
20961376Sdcs    struct dn_pkt *head, *tail ;	/* queue of packets */
21061376Sdcs    u_int len ;
21161376Sdcs    u_int len_bytes ;
21261376Sdcs    long numbytes ;		/* credit for transmission (dynamic queues) */
21361376Sdcs
21461376Sdcs    u_int64_t tot_pkts ;	/* statistics counters	*/
21561376Sdcs    u_int64_t tot_bytes ;
21661376Sdcs    u_int32_t drops ;
21761376Sdcs    int hash_slot ;	/* debugging/diagnostic */
21861376Sdcs
21961376Sdcs    /* RED parameters */
22065621Sdcs    int avg ;                   /* average queue length est. (scaled) */
22161376Sdcs    int count ;                 /* arrivals since last RED drop */
22261376Sdcs    int random ;                /* random value (scaled) */
22361376Sdcs    u_int32_t q_time ;          /* start of queue idle time */
22461376Sdcs
22561376Sdcs    /* WF2Q+ support */
22661376Sdcs    struct dn_flow_set *fs ; /* parent flow set */
22761376Sdcs    int blh_pos ;	/* position in backlogged_heap */
22861376Sdcs    dn_key sched_time ; /* current time when queue enters ready_heap */
22961376Sdcs
23061376Sdcs    dn_key S,F ; /* start-time, finishing time */
23161376Sdcs    /* setting F < S means the timestamp is invalid. We only need
23261376Sdcs     * to test this when the queue is empty.
23361376Sdcs     */
23461376Sdcs} ;
23561376Sdcs
23665621Sdcs/*
23765621Sdcs * flow_set descriptor. Contains the "template" parameters for the
23861376Sdcs * queue configuration, and pointers to the hash table of dn_flow_queue's.
23961376Sdcs *
24061376Sdcs * The hash table is an array of lists -- we identify the slot by
24161376Sdcs * hashing the flow-id, then scan the list looking for a match.
24261376Sdcs * The size of the hash table (buckets) is configurable on a per-queue
24361376Sdcs * basis.
24461376Sdcs */
24561376Sdcsstruct dn_flow_set {
24661376Sdcs    struct dn_flow_set *next; /* next flow set in all_flow_sets list */
24761376Sdcs
24865621Sdcs    u_short fs_nr ;             /* flow_set number       */
24961376Sdcs    u_short flags_fs;
25061376Sdcs#define DN_HAVE_FLOW_MASK	0x0001
25161376Sdcs#define DN_IS_PIPE		0x4000
25261376Sdcs#define DN_IS_QUEUE		0x8000
25361376Sdcs#define DN_IS_RED		0x0002
25461376Sdcs#define DN_IS_GENTLE_RED	0x0004
25561376Sdcs#define DN_QSIZE_IS_BYTES	0x0008	/* queue measured in bytes */
25665621Sdcs
25765621Sdcs    struct dn_pipe *pipe ;		/* pointer to parent pipe */
25861376Sdcs    u_short parent_nr ;		/* parent pipe#, 0 if local to a pipe */
25961376Sdcs
26065621Sdcs    int weight ; /* WFQ queue weight */
26165621Sdcs    int qsize ;		/* queue size in slots or bytes */
26265621Sdcs    int plr ;           /* pkt loss rate (2^31-1 means 100%) */
26365621Sdcs
26465621Sdcs    struct ipfw_flow_id flow_mask ;
26565621Sdcs    /* hash table of queues onto this flow_set */
26665621Sdcs    int rq_size ;		/* number of slots */
26765621Sdcs    int rq_elements ;		/* active elements */
26865621Sdcs    struct dn_flow_queue **rq;	/* array of rq_size entries */
26965621Sdcs    u_int32_t last_expired ;	/* do not expire too frequently */
27065621Sdcs	/* XXX some RED parameters as well ? */
27165621Sdcs    int backlogged ;		/* #active queues for this flowset */
27265621Sdcs
27365621Sdcs        /* RED parameters */
27461376Sdcs#define SCALE_RED               16
27561376Sdcs#define SCALE(x)                ( (x) << SCALE_RED )
27661376Sdcs#define SCALE_VAL(x)            ( (x) >> SCALE_RED )
27753672Sdcs#define SCALE_MUL(x,y)          ( ( (x) * (y) ) >> SCALE_RED )
27853672Sdcs    int w_q ;               /* queue weight (scaled) */
27953672Sdcs    int max_th ;            /* maximum threshold for queue (scaled) */
28053672Sdcs    int min_th ;            /* minimum threshold for queue (scaled) */
28153672Sdcs    int max_p ;             /* maximum value for p_b (scaled) */
28253672Sdcs    u_int c_1 ;             /* max_p/(max_th-min_th) (scaled) */
28353672Sdcs    u_int c_2 ;             /* max_p*min_th/(max_th-min_th) (scaled) */
28453672Sdcs    u_int c_3 ;             /* for GRED, (1-max_p)/max_th (scaled) */
28553672Sdcs    u_int c_4 ;             /* for GRED, 1 - 2*max_p (scaled) */
28653672Sdcs    u_int * w_q_lookup ;    /* lookup table for computing (1-w_q)^t */
28753672Sdcs    u_int lookup_depth ;    /* depth of lookup table */
28853672Sdcs    int lookup_step ;       /* granularity inside the lookup table */
28953672Sdcs    int lookup_weight ;     /* equal to (1-w_q)^t / (1-w_q)^(t+1) */
29053672Sdcs    int avg_pkt_size ;      /* medium packet size */
29153672Sdcs    int max_pkt_size ;      /* max packet size */
29253672Sdcs} ;
29353672Sdcs
29453672Sdcs/*
29553672Sdcs * Pipe descriptor. Contains global parameters, delay-line queue,
29653672Sdcs * and the flow_set used for fixed-rate queues.
29753672Sdcs *
29853672Sdcs * For WF2Q support it also has 4 heaps holding dn_flow_queue:
29953672Sdcs *   not_eligible_heap, for queues whose start time is higher
30053672Sdcs *	than the virtual time. Sorted by start time.
30144603Sdcs *   scheduler_heap, for queues eligible for scheduling. Sorted by
30244603Sdcs *	finish time.
30344603Sdcs *   backlogged_heap, all flows in the two heaps above, sorted by
30444603Sdcs *	start time. This is used to compute the virtual time.
30544603Sdcs *   idle_heap, all flows that are idle and can be removed. We
30644603Sdcs *	do that on each tick so we do not slow down too much
30744603Sdcs *	operations during forwarding.
30844603Sdcs *
30944603Sdcs */
31044603Sdcsstruct dn_pipe {			/* a pipe */
31144603Sdcs	struct dn_pipe *next ;
31244603Sdcs
31344603Sdcs    int	pipe_nr ;		/* number	*/
31444603Sdcs	int	bandwidth;		/* really, bytes/tick.	*/
31544603Sdcs	int	delay ;			/* really, ticks	*/
31644603Sdcs
31744603Sdcs    struct	dn_pkt *head, *tail ;	/* packets in delay line */
31847198Sdcs
31947198Sdcs    /* WF2Q+ */
32047198Sdcs    struct dn_heap scheduler_heap ; /* top extract - key Finish time*/
32147198Sdcs    struct dn_heap not_eligible_heap; /* top extract- key Start time */
32247198Sdcs    struct dn_heap backlogged_heap ; /* random extract - key Start time */
32347198Sdcs    struct dn_heap idle_heap ; /* random extract - key Start=Finish time */
32447198Sdcs
32547198Sdcs    dn_key V ; /* virtual time */
32647198Sdcs    int sum;	/* sum of weights of all active sessions */
32747198Sdcs    int numbytes;	/* bit i can transmit (more or less). */
32847198Sdcs
32947198Sdcs    dn_key sched_time ; /* first time pipe is scheduled in ready_heap */
33044603Sdcs
33144603Sdcs    /* the tx clock can come from an interface. In this case, the
33244603Sdcs     * name is below, and the pointer is filled when the rule is
33344603Sdcs     * configured. We identify this by setting the if_name to a
33444603Sdcs     * non-empty string.
33544603Sdcs     */
33644603Sdcs    char if_name[16];
33744603Sdcs    struct ifnet *ifp ;
33844603Sdcs    int ready ; /* set if ifp != NULL and we got a signal from it */
33944603Sdcs
34044603Sdcs    struct dn_flow_set fs ; /* used with fixed-rate flows */
34144603Sdcs};
34244603Sdcs
34344603Sdcs#ifdef _KERNEL
34444603Sdcs
34544603SdcsMALLOC_DECLARE(M_IPFW);
34644603Sdcs
34744603Sdcstypedef int ip_dn_ctl_t __P((struct sockopt *)) ;
34844603Sdcsextern ip_dn_ctl_t *ip_dn_ctl_ptr;
34944603Sdcs
35044603Sdcsvoid dn_rule_delete(void *r);		/* used in ip_fw.c */
35146005Sdcsint dummynet_io(int pipe, int dir,
35246005Sdcs	struct mbuf *m, struct ifnet *ifp, struct route *ro,
35346005Sdcs	struct sockaddr_in * dst,
35446005Sdcs	struct ip_fw_chain *rule, int flags);
35546005Sdcs#endif
35646005Sdcs
35746005Sdcs#endif /* _IP_DUMMYNET_H */
35846005Sdcs