ip_dummynet.h revision 71137
144603Sdcs/* 244603Sdcs * Copyright (c) 1998-2000 Luigi Rizzo, Universita` di Pisa 344603Sdcs * Portions Copyright (c) 2000 Akamba Corp. 444603Sdcs * All rights reserved 544603Sdcs * 644603Sdcs * Redistribution and use in source and binary forms, with or without 744603Sdcs * modification, are permitted provided that the following conditions 844603Sdcs * are met: 944603Sdcs * 1. Redistributions of source code must retain the above copyright 1044603Sdcs * notice, this list of conditions and the following disclaimer. 1144603Sdcs * 2. Redistributions in binary form must reproduce the above copyright 1244603Sdcs * notice, this list of conditions and the following disclaimer in the 1344603Sdcs * documentation and/or other materials provided with the distribution. 1444603Sdcs * 1544603Sdcs * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 1644603Sdcs * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 1744603Sdcs * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 1844603Sdcs * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 1944603Sdcs * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 2044603Sdcs * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 2144603Sdcs * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 2244603Sdcs * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 2344603Sdcs * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 2444603Sdcs * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 2550477Speter * SUCH DAMAGE. 2644603Sdcs * 2761694Sdcs * $FreeBSD: head/sys/netinet/ip_dummynet.h 71137 2001-01-17 01:09:40Z luigi $ 2861694Sdcs */ 2961694Sdcs 3061694Sdcs#ifndef _IP_DUMMYNET_H 3161694Sdcs#define _IP_DUMMYNET_H 3261694Sdcs 3361694Sdcs/* 3461694Sdcs * Definition of dummynet data structures. In the structures, I decided 3561694Sdcs * not to use the macros in <sys/queue.h> in the hope of making the code 3661694Sdcs * easier to port to other architectures. The type of lists and queue we 3761694Sdcs * use here is pretty simple anyways. 3861379Sdcs */ 3961694Sdcs 4061694Sdcs/* 4161694Sdcs * We start with a heap, which is used in the scheduler to decide when 4261694Sdcs * to transmit packets etc. 4361694Sdcs * 4461694Sdcs * The key for the heap is used for two different values: 4561694Sdcs * 4661694Sdcs * 1. timer ticks- max 10K/second, so 32 bits are enough; 4761694Sdcs * 4861694Sdcs * 2. virtual times. These increase in steps of len/x, where len is the 4961694Sdcs * packet length, and x is either the weight of the flow, or the 5061379Sdcs * sum of all weights. 5144603Sdcs * If we limit to max 1000 flows and a max weight of 100, then 5244603Sdcs * x needs 17 bits. The packet size is 16 bits, so we can easily 5344603Sdcs * overflow if we do not allow errors. 5444603Sdcs * So we use a key "dn_key" which is 64 bits. Some macros are used to 5544603Sdcs * compare key values and handle wraparounds. 5644603Sdcs * MAX64 returns the largest of two key values. 5744603Sdcs * MY_M is used as a shift count when doing fixed point arithmetic 5844603Sdcs * (a better name would be useful...). 5961376Sdcs */ 6061376Sdcstypedef u_int64_t dn_key ; /* sorting key */ 6161376Sdcs#define DN_KEY_LT(a,b) ((int64_t)((a)-(b)) < 0) 6261376Sdcs#define DN_KEY_LEQ(a,b) ((int64_t)((a)-(b)) <= 0) 6365621Sdcs#define DN_KEY_GT(a,b) ((int64_t)((a)-(b)) > 0) 6461376Sdcs#define DN_KEY_GEQ(a,b) ((int64_t)((a)-(b)) >= 0) 6561376Sdcs#define MAX64(x,y) (( (int64_t) ( (y)-(x) )) > 0 ) ? (y) : (x) 6661376Sdcs#define MY_M 16 /* number of left shift to obtain a larger precision */ 6761376Sdcs 6861376Sdcs/* 6961376Sdcs * XXX With this scaling, max 1000 flows, max weight 100, 1Gbit/s, the 7044603Sdcs * virtual time wraps every 15 days. 7165621Sdcs */ 7261376Sdcs 7361376Sdcs/* 7461376Sdcs * The OFFSET_OF macro is used to return the offset of a field within 7565621Sdcs * a structure. It is used by the heap management routines. 7661376Sdcs */ 7761376Sdcs#define OFFSET_OF(type, field) ((int)&( ((type *)0)->field) ) 7861376Sdcs 7961376Sdcs/* 8061376Sdcs * A heap entry is made of a key and a pointer to the actual 8161376Sdcs * object stored in the heap. 8261376Sdcs * The heap is an array of dn_heap_entry entries, dynamically allocated. 8361376Sdcs * Current size is "size", with "elements" actually in use. 8461376Sdcs * The heap normally supports only ordered insert and extract from the top. 8561376Sdcs * If we want to extract an object from the middle of the heap, we 8644603Sdcs * have to know where the object itself is located in the heap (or we 8765621Sdcs * need to scan the whole array). To this purpose, an object has a 8865621Sdcs * field (int) which contains the index of the object itself into the 8965621Sdcs * heap. When the object is moved, the field must also be updated. 9065621Sdcs * The offset of the index in the object is stored in the 'offset' 9165621Sdcs * field in the heap descriptor. The assumption is that this offset 9265621Sdcs * is non-zero if we want to support extract from the middle. 9365621Sdcs */ 9465621Sdcsstruct dn_heap_entry { 9565621Sdcs dn_key key ; /* sorting key. Topmost element is smallest one */ 9665621Sdcs void *object ; /* object pointer */ 9761376Sdcs} ; 9865621Sdcs 9965621Sdcsstruct dn_heap { 10065621Sdcs int size ; 10165621Sdcs int elements ; 10265621Sdcs int offset ; /* XXX if > 0 this is the offset of direct ptr to obj */ 10365621Sdcs struct dn_heap_entry *p ; /* really an array of "size" entries */ 10465621Sdcs} ; 10565621Sdcs 10665621Sdcs/* 10765621Sdcs * MT_DUMMYNET is a new (fake) mbuf type that is prepended to the 10861376Sdcs * packet when it comes out of a pipe. The definition 10965621Sdcs * ought to go in /sys/sys/mbuf.h but here it is less intrusive. 11065621Sdcs */ 11165621Sdcs 11265621Sdcs#define MT_DUMMYNET MT_CONTROL 11365621Sdcs 11465621Sdcs/* 11561376Sdcs * struct dn_pkt identifies a packet in the dummynet queue. The 11665621Sdcs * first part is really an m_hdr for implementation purposes, and some 11765621Sdcs * fields are saved there. When passing the packet back to the ip_input/ 11865621Sdcs * ip_output()/bdg_forward, the struct is prepended to the mbuf chain with type 11965621Sdcs * MT_DUMMYNET, and contains the pointer to the matching rule. 12061376Sdcs * 12165621Sdcs * Note: there is no real need to make this structure contain an m_hdr, 12265621Sdcs * in the future this should be changed to a normal data structure. 12365621Sdcs */ 12465621Sdcsstruct dn_pkt { 12565621Sdcs struct m_hdr hdr ; 12665621Sdcs#define dn_next hdr.mh_nextpkt /* next element in queue */ 12765621Sdcs#define DN_NEXT(x) (struct dn_pkt *)(x)->dn_next 12865621Sdcs#define dn_m hdr.mh_next /* packet to be forwarded */ 12965621Sdcs#define dn_dir hdr.mh_flags /* action when pkt extracted from a queue */ 13065621Sdcs#define DN_TO_IP_OUT 1 13165621Sdcs#define DN_TO_IP_IN 2 13265621Sdcs#define DN_TO_BDG_FWD 3 13365621Sdcs 13465621Sdcs dn_key output_time; /* when the pkt is due for delivery */ 13565621Sdcs struct ifnet *ifp; /* interface, for ip_output */ 13661376Sdcs struct sockaddr_in *dn_dst ; 13765621Sdcs struct route ro; /* route, for ip_output. MUST COPY */ 13865621Sdcs int flags ; /* flags, for ip_output (IPv6 ?) */ 13965621Sdcs}; 14065621Sdcs 14161376Sdcs/* 14265621Sdcs * Overall structure of dummynet (with WF2Q+): 14365621Sdcs 14465621SdcsIn dummynet, packets are selected with the firewall rules, and passed 14565621Sdcsto two different objects: PIPE or QUEUE. 14665621Sdcs 14765621SdcsA QUEUE is just a queue with configurable size and queue management 14865621Sdcspolicy. It is also associated with a mask (to discriminate among 14965621Sdcsdifferent flows), a weight (used to give different shares of the 15065621Sdcsbandwidth to different flows) and a "pipe", which essentially 15165621Sdcssupplies the transmit clock for all queues associated with that 15265621Sdcspipe. 15365621Sdcs 15465621SdcsA PIPE emulates a fixed-bandwidth link, whose bandwidth is 15565621Sdcsconfigurable. The "clock" for a pipe can come from either an 15665621Sdcsinternal timer, or from the transmit interrupt of an interface. 15765621SdcsA pipe is also associated with one (or more, if masks are used) 15865621Sdcsqueue, where all packets for that pipe are stored. 15965621Sdcs 16065621SdcsThe bandwidth available on the pipe is shared by the queues 16165621Sdcsassociated with that pipe (only one in case the packet is sent 16265621Sdcsto a PIPE) according to the WF2Q+ scheduling algorithm and the 16365621Sdcsconfigured weights. 16465621Sdcs 16565621SdcsIn general, incoming packets are stored in the appropriate queue, 16665621Sdcswhich is then placed into one of a few heaps managed by a scheduler 16765621Sdcsto decide when the packet should be extracted. 16865621SdcsThe scheduler (a function called dummynet()) is run at every timer 16965621Sdcstick, and grabs queues from the head of the heaps when they are 17065621Sdcsready for processing. 17165621Sdcs 17265621SdcsThere are three data structures definining a pipe and associated queues: 17365621Sdcs 17465621Sdcs + dn_pipe, which contains the main configuration parameters related 17565621Sdcs to delay and bandwidth; 17665621Sdcs + dn_flow_set, which contains WF2Q+ configuration, flow 17765621Sdcs masks, plr and RED configuration; 17865621Sdcs + dn_flow_queue, which is the per-flow queue (containing the packets) 17965621Sdcs 18065621SdcsMultiple dn_flow_set can be linked to the same pipe, and multiple 18165621Sdcsdn_flow_queue can be linked to the same dn_flow_set. 18265621SdcsAll data structures are linked in a linear list which is used for 18365621Sdcshousekeeping purposes. 18465621Sdcs 18565621SdcsDuring configuration, we create and initialize the dn_flow_set 18665621Sdcsand dn_pipe structures (a dn_pipe also contains a dn_flow_set). 18761376Sdcs 18861376SdcsAt runtime: packets are sent to the appropriate dn_flow_set (either 18961376SdcsWFQ ones, or the one embedded in the dn_pipe for fixed-rate flows), 19061376Sdcswhich in turn dispatches them to the appropriate dn_flow_queue 19161376Sdcs(created dynamically according to the masks). 19261376Sdcs 19361376SdcsThe transmit clock for fixed rate flows (ready_event()) selects the 19461376Sdcsdn_flow_queue to be used to transmit the next packet. For WF2Q, 19561376Sdcswfq_ready_event() extract a pipe which in turn selects the right 19661376Sdcsflow using a number of heaps defined into the pipe itself. 19765621Sdcs 19861376Sdcs * 19961376Sdcs */ 20061376Sdcs 20161376Sdcs/* 20261376Sdcs * per flow queue. This contains the flow identifier, the queue 20361376Sdcs * of packets, counters, and parameters used to support both RED and 20461376Sdcs * WF2Q+. 20561376Sdcs */ 20661376Sdcsstruct dn_flow_queue { 20761376Sdcs struct dn_flow_queue *next ; 20861376Sdcs struct ipfw_flow_id id ; 20961376Sdcs struct dn_pkt *head, *tail ; /* queue of packets */ 21061376Sdcs u_int len ; 21161376Sdcs u_int len_bytes ; 21261376Sdcs long numbytes ; /* credit for transmission (dynamic queues) */ 21361376Sdcs 21461376Sdcs u_int64_t tot_pkts ; /* statistics counters */ 21561376Sdcs u_int64_t tot_bytes ; 21661376Sdcs u_int32_t drops ; 21761376Sdcs int hash_slot ; /* debugging/diagnostic */ 21861376Sdcs 21961376Sdcs /* RED parameters */ 22065621Sdcs int avg ; /* average queue length est. (scaled) */ 22161376Sdcs int count ; /* arrivals since last RED drop */ 22261376Sdcs int random ; /* random value (scaled) */ 22361376Sdcs u_int32_t q_time ; /* start of queue idle time */ 22461376Sdcs 22561376Sdcs /* WF2Q+ support */ 22661376Sdcs struct dn_flow_set *fs ; /* parent flow set */ 22761376Sdcs int blh_pos ; /* position in backlogged_heap */ 22861376Sdcs dn_key sched_time ; /* current time when queue enters ready_heap */ 22961376Sdcs 23061376Sdcs dn_key S,F ; /* start-time, finishing time */ 23161376Sdcs /* setting F < S means the timestamp is invalid. We only need 23261376Sdcs * to test this when the queue is empty. 23361376Sdcs */ 23461376Sdcs} ; 23561376Sdcs 23665621Sdcs/* 23765621Sdcs * flow_set descriptor. Contains the "template" parameters for the 23861376Sdcs * queue configuration, and pointers to the hash table of dn_flow_queue's. 23961376Sdcs * 24061376Sdcs * The hash table is an array of lists -- we identify the slot by 24161376Sdcs * hashing the flow-id, then scan the list looking for a match. 24261376Sdcs * The size of the hash table (buckets) is configurable on a per-queue 24361376Sdcs * basis. 24461376Sdcs */ 24561376Sdcsstruct dn_flow_set { 24661376Sdcs struct dn_flow_set *next; /* next flow set in all_flow_sets list */ 24761376Sdcs 24865621Sdcs u_short fs_nr ; /* flow_set number */ 24961376Sdcs u_short flags_fs; 25061376Sdcs#define DN_HAVE_FLOW_MASK 0x0001 25161376Sdcs#define DN_IS_PIPE 0x4000 25261376Sdcs#define DN_IS_QUEUE 0x8000 25361376Sdcs#define DN_IS_RED 0x0002 25461376Sdcs#define DN_IS_GENTLE_RED 0x0004 25561376Sdcs#define DN_QSIZE_IS_BYTES 0x0008 /* queue measured in bytes */ 25665621Sdcs 25765621Sdcs struct dn_pipe *pipe ; /* pointer to parent pipe */ 25861376Sdcs u_short parent_nr ; /* parent pipe#, 0 if local to a pipe */ 25961376Sdcs 26065621Sdcs int weight ; /* WFQ queue weight */ 26165621Sdcs int qsize ; /* queue size in slots or bytes */ 26265621Sdcs int plr ; /* pkt loss rate (2^31-1 means 100%) */ 26365621Sdcs 26465621Sdcs struct ipfw_flow_id flow_mask ; 26565621Sdcs /* hash table of queues onto this flow_set */ 26665621Sdcs int rq_size ; /* number of slots */ 26765621Sdcs int rq_elements ; /* active elements */ 26865621Sdcs struct dn_flow_queue **rq; /* array of rq_size entries */ 26965621Sdcs u_int32_t last_expired ; /* do not expire too frequently */ 27065621Sdcs /* XXX some RED parameters as well ? */ 27165621Sdcs int backlogged ; /* #active queues for this flowset */ 27265621Sdcs 27365621Sdcs /* RED parameters */ 27461376Sdcs#define SCALE_RED 16 27561376Sdcs#define SCALE(x) ( (x) << SCALE_RED ) 27661376Sdcs#define SCALE_VAL(x) ( (x) >> SCALE_RED ) 27753672Sdcs#define SCALE_MUL(x,y) ( ( (x) * (y) ) >> SCALE_RED ) 27853672Sdcs int w_q ; /* queue weight (scaled) */ 27953672Sdcs int max_th ; /* maximum threshold for queue (scaled) */ 28053672Sdcs int min_th ; /* minimum threshold for queue (scaled) */ 28153672Sdcs int max_p ; /* maximum value for p_b (scaled) */ 28253672Sdcs u_int c_1 ; /* max_p/(max_th-min_th) (scaled) */ 28353672Sdcs u_int c_2 ; /* max_p*min_th/(max_th-min_th) (scaled) */ 28453672Sdcs u_int c_3 ; /* for GRED, (1-max_p)/max_th (scaled) */ 28553672Sdcs u_int c_4 ; /* for GRED, 1 - 2*max_p (scaled) */ 28653672Sdcs u_int * w_q_lookup ; /* lookup table for computing (1-w_q)^t */ 28753672Sdcs u_int lookup_depth ; /* depth of lookup table */ 28853672Sdcs int lookup_step ; /* granularity inside the lookup table */ 28953672Sdcs int lookup_weight ; /* equal to (1-w_q)^t / (1-w_q)^(t+1) */ 29053672Sdcs int avg_pkt_size ; /* medium packet size */ 29153672Sdcs int max_pkt_size ; /* max packet size */ 29253672Sdcs} ; 29353672Sdcs 29453672Sdcs/* 29553672Sdcs * Pipe descriptor. Contains global parameters, delay-line queue, 29653672Sdcs * and the flow_set used for fixed-rate queues. 29753672Sdcs * 29853672Sdcs * For WF2Q support it also has 4 heaps holding dn_flow_queue: 29953672Sdcs * not_eligible_heap, for queues whose start time is higher 30053672Sdcs * than the virtual time. Sorted by start time. 30144603Sdcs * scheduler_heap, for queues eligible for scheduling. Sorted by 30244603Sdcs * finish time. 30344603Sdcs * backlogged_heap, all flows in the two heaps above, sorted by 30444603Sdcs * start time. This is used to compute the virtual time. 30544603Sdcs * idle_heap, all flows that are idle and can be removed. We 30644603Sdcs * do that on each tick so we do not slow down too much 30744603Sdcs * operations during forwarding. 30844603Sdcs * 30944603Sdcs */ 31044603Sdcsstruct dn_pipe { /* a pipe */ 31144603Sdcs struct dn_pipe *next ; 31244603Sdcs 31344603Sdcs int pipe_nr ; /* number */ 31444603Sdcs int bandwidth; /* really, bytes/tick. */ 31544603Sdcs int delay ; /* really, ticks */ 31644603Sdcs 31744603Sdcs struct dn_pkt *head, *tail ; /* packets in delay line */ 31847198Sdcs 31947198Sdcs /* WF2Q+ */ 32047198Sdcs struct dn_heap scheduler_heap ; /* top extract - key Finish time*/ 32147198Sdcs struct dn_heap not_eligible_heap; /* top extract- key Start time */ 32247198Sdcs struct dn_heap backlogged_heap ; /* random extract - key Start time */ 32347198Sdcs struct dn_heap idle_heap ; /* random extract - key Start=Finish time */ 32447198Sdcs 32547198Sdcs dn_key V ; /* virtual time */ 32647198Sdcs int sum; /* sum of weights of all active sessions */ 32747198Sdcs int numbytes; /* bit i can transmit (more or less). */ 32847198Sdcs 32947198Sdcs dn_key sched_time ; /* first time pipe is scheduled in ready_heap */ 33044603Sdcs 33144603Sdcs /* the tx clock can come from an interface. In this case, the 33244603Sdcs * name is below, and the pointer is filled when the rule is 33344603Sdcs * configured. We identify this by setting the if_name to a 33444603Sdcs * non-empty string. 33544603Sdcs */ 33644603Sdcs char if_name[16]; 33744603Sdcs struct ifnet *ifp ; 33844603Sdcs int ready ; /* set if ifp != NULL and we got a signal from it */ 33944603Sdcs 34044603Sdcs struct dn_flow_set fs ; /* used with fixed-rate flows */ 34144603Sdcs}; 34244603Sdcs 34344603Sdcs#ifdef _KERNEL 34444603Sdcs 34544603SdcsMALLOC_DECLARE(M_IPFW); 34644603Sdcs 34744603Sdcstypedef int ip_dn_ctl_t __P((struct sockopt *)) ; 34844603Sdcsextern ip_dn_ctl_t *ip_dn_ctl_ptr; 34944603Sdcs 35044603Sdcsvoid dn_rule_delete(void *r); /* used in ip_fw.c */ 35146005Sdcsint dummynet_io(int pipe, int dir, 35246005Sdcs struct mbuf *m, struct ifnet *ifp, struct route *ro, 35346005Sdcs struct sockaddr_in * dst, 35446005Sdcs struct ip_fw_chain *rule, int flags); 35546005Sdcs#endif 35646005Sdcs 35746005Sdcs#endif /* _IP_DUMMYNET_H */ 35846005Sdcs