1204591Sluigi/*
2204591Sluigi * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
3204591Sluigi * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa
4204591Sluigi * All rights reserved
5204591Sluigi *
6204591Sluigi * Redistribution and use in source and binary forms, with or without
7204591Sluigi * modification, are permitted provided that the following conditions
8204591Sluigi * are met:
9204591Sluigi * 1. Redistributions of source code must retain the above copyright
10204591Sluigi *    notice, this list of conditions and the following disclaimer.
11204591Sluigi * 2. Redistributions in binary form must reproduce the above copyright
12204591Sluigi *    notice, this list of conditions and the following disclaimer in the
13204591Sluigi *    documentation and/or other materials provided with the distribution.
14204591Sluigi *
15204591Sluigi * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16204591Sluigi * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17204591Sluigi * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18204591Sluigi * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19204591Sluigi * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20204591Sluigi * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21204591Sluigi * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22204591Sluigi * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23204591Sluigi * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24204591Sluigi * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25204591Sluigi * SUCH DAMAGE.
26204591Sluigi */
27204591Sluigi
28204591Sluigi/*
29204591Sluigi * $FreeBSD: stable/11/sys/netpfil/ipfw/dn_sched_wf2q.c 325730 2017-11-12 01:26:43Z truckman $
30204591Sluigi */
31204591Sluigi
32204591Sluigi#ifdef _KERNEL
33204591Sluigi#include <sys/malloc.h>
34204591Sluigi#include <sys/socket.h>
35204591Sluigi#include <sys/socketvar.h>
36204591Sluigi#include <sys/kernel.h>
37325730Struckman#include <sys/lock.h>
38204591Sluigi#include <sys/mbuf.h>
39204591Sluigi#include <sys/module.h>
40325730Struckman#include <sys/rwlock.h>
41204591Sluigi#include <net/if.h>	/* IFNAMSIZ */
42204591Sluigi#include <netinet/in.h>
43204591Sluigi#include <netinet/ip_var.h>		/* ipfw_rule_ref */
44204591Sluigi#include <netinet/ip_fw.h>	/* flow_id */
45204591Sluigi#include <netinet/ip_dummynet.h>
46325730Struckman#include <netpfil/ipfw/ip_fw_private.h>
47240494Sglebius#include <netpfil/ipfw/dn_heap.h>
48240494Sglebius#include <netpfil/ipfw/ip_dn_private.h>
49300779Struckman#ifdef NEW_AQM
50300779Struckman#include <netpfil/ipfw/dn_aqm.h>
51300779Struckman#endif
52240494Sglebius#include <netpfil/ipfw/dn_sched.h>
53204591Sluigi#else
54204591Sluigi#include <dn_test.h>
55204591Sluigi#endif
56204591Sluigi
/*
 * MAX64(x, y): the later of two 64-bit timestamps.
 * The comparison goes through the signed difference (y - x), so it stays
 * meaningful when the unsigned values wrap around.
 * The whole replacement list is parenthesized so that the macro can be
 * used as an operand of a larger expression. Each argument may be
 * evaluated more than once, so arguments must not have side effects.
 */
#ifndef MAX64
#define	MAX64(x, y)	((((int64_t)((y) - (x))) > 0) ? (y) : (x))
#endif
60204591Sluigi
/*
 * Timestamps are computed on 64 bits using fixed point arithmetic.
 * LMAX_BITS and WMAX_BITS are the max number of bits for the packet
 * length and for the sum of weights, respectively. FRAC_BITS is the
 * number of fractional bits. The constraints are:
 *  - FRAC_BITS much larger than WMAX_BITS, to avoid too large errors
 *    when computing the inverse;
 *  - FRAC_BITS < 32, so that 1/w can be computed with an unsigned
 *    32-bit division;
 *  - LMAX_BITS + WMAX_BITS + FRAC_BITS well below 64, to avoid
 *    wraparounds.
 * As an example:
 * FRAC_BITS = 26, LMAX_BITS = 14, WMAX_BITS = 19
 *
 * ONE_FP has its own guard, so a build that predefines FRAC_BITS still
 * gets a matching ONE_FP.
 */
#ifndef FRAC_BITS
#define	FRAC_BITS	28	/* shift for fixed point arithmetic */
#endif
#ifndef ONE_FP
#define	ONE_FP		(1UL << FRAC_BITS)	/* 1.0 in fixed point */
#endif
76204591Sluigi
77204591Sluigi/*
78204591Sluigi * Private information for the scheduler instance:
79204591Sluigi * sch_heap (key is Finish time) returns the next queue to serve
80204591Sluigi * ne_heap (key is Start time) stores not-eligible queues
81204591Sluigi * idle_heap (key=start/finish time) stores idle flows. It must
82204591Sluigi *	support extract-from-middle.
83204591Sluigi * A flow is only in 1 of the three heaps.
84204591Sluigi * XXX todo: use a more efficient data structure, e.g. a tree sorted
85204591Sluigi * by F with min_subtree(S) in each node
86204591Sluigi */
87204591Sluigistruct wf2qp_si {
88204591Sluigi    struct dn_heap sch_heap;	/* top extract - key Finish  time */
89204591Sluigi    struct dn_heap ne_heap;	/* top extract - key Start   time */
90204591Sluigi    struct dn_heap idle_heap;	/* random extract - key Start=Finish time */
91204591Sluigi    uint64_t V;			/* virtual time */
92204591Sluigi    uint32_t inv_wsum;		/* inverse of sum of weights */
93204591Sluigi    uint32_t wsum;		/* sum of weights */
94204591Sluigi};
95204591Sluigi
96204591Sluigistruct wf2qp_queue {
97204591Sluigi    struct dn_queue _q;
98204591Sluigi    uint64_t S, F;		/* start time, finish time */
99204591Sluigi    uint32_t inv_w;		/* ONE_FP / weight */
100204591Sluigi    int32_t heap_pos;		/* position (index) of struct in heap */
101204591Sluigi};
102204591Sluigi
103204591Sluigi/*
104204591Sluigi * This file implements a WF2Q+ scheduler as it has been in dummynet
105204591Sluigi * since 2000.
106204591Sluigi * The scheduler supports per-flow queues and has O(log N) complexity.
107204591Sluigi *
108204591Sluigi * WF2Q+ needs to drain entries from the idle heap so that we
109204591Sluigi * can keep the sum of weights up to date. We can do it whenever
110204591Sluigi * we get a chance, or periodically, or following some other
111204591Sluigi * strategy. The function idle_check() drains at most N elements
112204591Sluigi * from the idle heap.
113204591Sluigi */
114204591Sluigistatic void
115204591Sluigiidle_check(struct wf2qp_si *si, int n, int force)
116204591Sluigi{
117204591Sluigi    struct dn_heap *h = &si->idle_heap;
118204591Sluigi    while (n-- > 0 && h->elements > 0 &&
119204591Sluigi		(force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) {
120204591Sluigi	struct dn_queue *q = HEAP_TOP(h)->object;
121204591Sluigi        struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
122204591Sluigi
123204591Sluigi        heap_extract(h, NULL);
124204591Sluigi        /* XXX to let the flowset delete the queue we should
125204591Sluigi	 * mark it as 'unused' by the scheduler.
126204591Sluigi	 */
127204591Sluigi        alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */
128204591Sluigi        si->wsum -= q->fs->fs.par[0];	/* adjust sum of weights */
129204591Sluigi	if (si->wsum > 0)
130204591Sluigi		si->inv_wsum = ONE_FP/si->wsum;
131204591Sluigi    }
132204591Sluigi}
133204591Sluigi
134206845Sluigistatic int
135204591Sluigiwf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
136204591Sluigi{
137204591Sluigi    struct dn_fsk *fs = q->fs;
138204591Sluigi    struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
139204591Sluigi    struct wf2qp_queue *alg_fq;
140204591Sluigi    uint64_t len = m->m_pkthdr.len;
141204591Sluigi
142204591Sluigi    if (m != q->mq.head) {
143204591Sluigi	if (dn_enqueue(q, m, 0)) /* packet was dropped */
144204591Sluigi	    return 1;
145204591Sluigi	if (m != q->mq.head)	/* queue was already busy */
146204591Sluigi	    return 0;
147204591Sluigi    }
148204591Sluigi
149206845Sluigi    /* If reach this point, queue q was idle */
150204591Sluigi    alg_fq = (struct wf2qp_queue *)q;
151204591Sluigi
152204591Sluigi    if (DN_KEY_LT(alg_fq->F, alg_fq->S)) {
153204591Sluigi        /* F<S means timestamps are invalid ->brand new queue. */
154204591Sluigi        alg_fq->S = si->V;		/* init start time */
155204591Sluigi        si->wsum += fs->fs.par[0];	/* add weight of new queue. */
156204591Sluigi	si->inv_wsum = ONE_FP/si->wsum;
157204591Sluigi    } else { /* if it was idle then it was in the idle heap */
158204591Sluigi        heap_extract(&si->idle_heap, q);
159204591Sluigi        alg_fq->S = MAX64(alg_fq->F, si->V);	/* compute new S */
160204591Sluigi    }
161204591Sluigi    alg_fq->F = alg_fq->S + len * alg_fq->inv_w;
162204591Sluigi
163204591Sluigi    /* if nothing is backlogged, make sure this flow is eligible */
164204591Sluigi    if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0)
165204591Sluigi        si->V = MAX64(alg_fq->S, si->V);
166204591Sluigi
167204591Sluigi    /*
168204591Sluigi     * Look at eligibility. A flow is not eligibile if S>V (when
169204591Sluigi     * this happens, it means that there is some other flow already
170204591Sluigi     * scheduled for the same pipe, so the sch_heap cannot be
171204591Sluigi     * empty). If the flow is not eligible we just store it in the
172204591Sluigi     * ne_heap. Otherwise, we store in the sch_heap.
173204591Sluigi     * Note that for all flows in sch_heap (SCH), S_i <= V,
174204591Sluigi     * and for all flows in ne_heap (NEH), S_i > V.
175204591Sluigi     * So when we need to compute max(V, min(S_i)) forall i in
176204591Sluigi     * SCH+NEH, we only need to look into NEH.
177204591Sluigi     */
178204591Sluigi    if (DN_KEY_LT(si->V, alg_fq->S)) {
179204591Sluigi        /* S>V means flow Not eligible. */
180204591Sluigi        if (si->sch_heap.elements == 0)
181204591Sluigi            D("++ ouch! not eligible but empty scheduler!");
182204591Sluigi        heap_insert(&si->ne_heap, alg_fq->S, q);
183204591Sluigi    } else {
184204591Sluigi        heap_insert(&si->sch_heap, alg_fq->F, q);
185204591Sluigi    }
186204591Sluigi    return 0;
187204591Sluigi}
188204591Sluigi
189204591Sluigi/* XXX invariant: sch > 0 || V >= min(S in neh) */
190204591Sluigistatic struct mbuf *
191204591Sluigiwf2qp_dequeue(struct dn_sch_inst *_si)
192204591Sluigi{
193204591Sluigi	/* Access scheduler instance private data */
194204591Sluigi	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
195204591Sluigi	struct mbuf *m;
196204591Sluigi	struct dn_queue *q;
197204591Sluigi	struct dn_heap *sch = &si->sch_heap;
198204591Sluigi	struct dn_heap *neh = &si->ne_heap;
199204591Sluigi	struct wf2qp_queue *alg_fq;
200204591Sluigi
201204591Sluigi	if (sch->elements == 0 && neh->elements == 0) {
202204591Sluigi		/* we have nothing to do. We could kill the idle heap
203204591Sluigi		 * altogether and reset V
204204591Sluigi		 */
205204591Sluigi		idle_check(si, 0x7fffffff, 1);
206204591Sluigi		si->V = 0;
207204591Sluigi		si->wsum = 0;	/* should be set already */
208204591Sluigi		return NULL;	/* quick return if nothing to do */
209204591Sluigi	}
210204591Sluigi	idle_check(si, 1, 0);	/* drain something from the idle heap */
211204591Sluigi
212204591Sluigi	/* make sure at least one element is eligible, bumping V
213204591Sluigi	 * and moving entries that have become eligible.
214204591Sluigi	 * We need to repeat the first part twice, before and
215204591Sluigi	 * after extracting the candidate, or enqueue() will
216204591Sluigi	 * find the data structure in a wrong state.
217204591Sluigi	 */
218204591Sluigi  m = NULL;
219204591Sluigi  for(;;) {
220204591Sluigi	/*
221204591Sluigi	 * Compute V = max(V, min(S_i)). Remember that all elements
222204591Sluigi	 * in sch have by definition S_i <= V so if sch is not empty,
223204591Sluigi	 * V is surely the max and we must not update it. Conversely,
224204591Sluigi	 * if sch is empty we only need to look at neh.
225204591Sluigi	 * We don't need to move the queues, as it will be done at the
226204591Sluigi	 * next enqueue
227204591Sluigi	 */
228204591Sluigi	if (sch->elements == 0 && neh->elements > 0) {
229204591Sluigi		si->V = MAX64(si->V, HEAP_TOP(neh)->key);
230204591Sluigi	}
231204591Sluigi	while (neh->elements > 0 &&
232204591Sluigi		    DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) {
233204591Sluigi		q = HEAP_TOP(neh)->object;
234204591Sluigi		alg_fq = (struct wf2qp_queue *)q;
235204591Sluigi		heap_extract(neh, NULL);
236204591Sluigi		heap_insert(sch, alg_fq->F, q);
237204591Sluigi	}
238204591Sluigi	if (m) /* pkt found in previous iteration */
239204591Sluigi		break;
240204591Sluigi	/* ok we have at least one eligible pkt */
241204591Sluigi	q = HEAP_TOP(sch)->object;
242204591Sluigi	alg_fq = (struct wf2qp_queue *)q;
243204591Sluigi	m = dn_dequeue(q);
244204591Sluigi	heap_extract(sch, NULL); /* Remove queue from heap. */
245204591Sluigi	si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum;
246204591Sluigi	alg_fq->S = alg_fq->F;  /* Update start time. */
247204591Sluigi	if (q->mq.head == 0) {	/* not backlogged any more. */
248204591Sluigi		heap_insert(&si->idle_heap, alg_fq->F, q);
249204591Sluigi	} else {			/* Still backlogged. */
250204591Sluigi		/* Update F, store in neh or sch */
251204591Sluigi		uint64_t len = q->mq.head->m_pkthdr.len;
252204591Sluigi		alg_fq->F += len * alg_fq->inv_w;
253204591Sluigi		if (DN_KEY_LEQ(alg_fq->S, si->V)) {
254204591Sluigi			heap_insert(sch, alg_fq->F, q);
255204591Sluigi		} else {
256204591Sluigi			heap_insert(neh, alg_fq->S, q);
257204591Sluigi		}
258204591Sluigi	}
259204591Sluigi    }
260204591Sluigi	return m;
261204591Sluigi}
262204591Sluigi
263204591Sluigistatic int
264204591Sluigiwf2qp_new_sched(struct dn_sch_inst *_si)
265204591Sluigi{
266204591Sluigi	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
267204591Sluigi	int ofs = offsetof(struct wf2qp_queue, heap_pos);
268204591Sluigi
269204591Sluigi	/* all heaps support extract from middle */
270204591Sluigi	if (heap_init(&si->idle_heap, 16, ofs) ||
271204591Sluigi	    heap_init(&si->sch_heap, 16, ofs) ||
272204591Sluigi	    heap_init(&si->ne_heap, 16, ofs)) {
273204591Sluigi		heap_free(&si->ne_heap);
274204591Sluigi		heap_free(&si->sch_heap);
275204591Sluigi		heap_free(&si->idle_heap);
276204591Sluigi		return ENOMEM;
277204591Sluigi	}
278204591Sluigi	return 0;
279204591Sluigi}
280204591Sluigi
281204591Sluigistatic int
282204591Sluigiwf2qp_free_sched(struct dn_sch_inst *_si)
283204591Sluigi{
284204591Sluigi	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
285204591Sluigi
286204591Sluigi	heap_free(&si->sch_heap);
287204591Sluigi	heap_free(&si->ne_heap);
288204591Sluigi	heap_free(&si->idle_heap);
289204591Sluigi
290204591Sluigi	return 0;
291204591Sluigi}
292204591Sluigi
293204591Sluigistatic int
294204591Sluigiwf2qp_new_fsk(struct dn_fsk *fs)
295204591Sluigi{
296204591Sluigi	ipdn_bound_var(&fs->fs.par[0], 1,
297204591Sluigi		1, 100, "WF2Q+ weight");
298204591Sluigi	return 0;
299204591Sluigi}
300204591Sluigi
301204591Sluigistatic int
302204591Sluigiwf2qp_new_queue(struct dn_queue *_q)
303204591Sluigi{
304204591Sluigi	struct wf2qp_queue *q = (struct wf2qp_queue *)_q;
305204591Sluigi
306204591Sluigi	_q->ni.oid.subtype = DN_SCHED_WF2QP;
307204591Sluigi	q->F = 0;	/* not strictly necessary */
308204591Sluigi	q->S = q->F + 1;    /* mark timestamp as invalid. */
309204591Sluigi        q->inv_w = ONE_FP / _q->fs->fs.par[0];
310204591Sluigi	if (_q->mq.head != NULL) {
311204591Sluigi		wf2qp_enqueue(_q->_si, _q, _q->mq.head);
312204591Sluigi	}
313204591Sluigi	return 0;
314204591Sluigi}
315204591Sluigi
316204591Sluigi/*
317204591Sluigi * Called when the infrastructure removes a queue (e.g. flowset
318204591Sluigi * is reconfigured). Nothing to do if we did not 'own' the queue,
319204591Sluigi * otherwise remove it from the right heap and adjust the sum
320204591Sluigi * of weights.
321204591Sluigi */
322204591Sluigistatic int
323204591Sluigiwf2qp_free_queue(struct dn_queue *q)
324204591Sluigi{
325204591Sluigi	struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
326204591Sluigi	struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1);
327213267Sluigi
328204591Sluigi	if (alg_fq->S >= alg_fq->F + 1)
329204591Sluigi		return 0;	/* nothing to do, not in any heap */
330204591Sluigi	si->wsum -= q->fs->fs.par[0];
331204591Sluigi	if (si->wsum > 0)
332204591Sluigi		si->inv_wsum = ONE_FP/si->wsum;
333204591Sluigi
334204591Sluigi	/* extract from the heap. XXX TODO we may need to adjust V
335204591Sluigi	 * to make sure the invariants hold.
336204591Sluigi	 */
337204591Sluigi	if (q->mq.head == NULL) {
338204591Sluigi		heap_extract(&si->idle_heap, q);
339204591Sluigi	} else if (DN_KEY_LT(si->V, alg_fq->S)) {
340204591Sluigi		heap_extract(&si->ne_heap, q);
341204591Sluigi	} else {
342204591Sluigi		heap_extract(&si->sch_heap, q);
343204591Sluigi	}
344204591Sluigi	return 0;
345204591Sluigi}
346204591Sluigi
347204591Sluigi/*
348204591Sluigi * WF2Q+ scheduler descriptor
349204591Sluigi * contains the type of the scheduler, the name, the size of the
350204591Sluigi * structures and function pointers.
351204591Sluigi */
352204591Sluigistatic struct dn_alg wf2qp_desc = {
353204591Sluigi	_SI( .type = ) DN_SCHED_WF2QP,
354204591Sluigi	_SI( .name = ) "WF2Q+",
355204591Sluigi	_SI( .flags = ) DN_MULTIQUEUE,
356204591Sluigi
357204591Sluigi	/* we need extra space in the si and the queue */
358204591Sluigi	_SI( .schk_datalen = ) 0,
359204591Sluigi	_SI( .si_datalen = ) sizeof(struct wf2qp_si),
360204591Sluigi	_SI( .q_datalen = ) sizeof(struct wf2qp_queue) -
361204591Sluigi				sizeof(struct dn_queue),
362204591Sluigi
363204591Sluigi	_SI( .enqueue = ) wf2qp_enqueue,
364204591Sluigi	_SI( .dequeue = ) wf2qp_dequeue,
365204591Sluigi
366204591Sluigi	_SI( .config = )  NULL,
367204591Sluigi	_SI( .destroy = )  NULL,
368204591Sluigi	_SI( .new_sched = ) wf2qp_new_sched,
369204591Sluigi	_SI( .free_sched = ) wf2qp_free_sched,
370206845Sluigi
371204591Sluigi	_SI( .new_fsk = ) wf2qp_new_fsk,
372204591Sluigi	_SI( .free_fsk = )  NULL,
373204591Sluigi
374204591Sluigi	_SI( .new_queue = ) wf2qp_new_queue,
375204591Sluigi	_SI( .free_queue = ) wf2qp_free_queue,
376300779Struckman#ifdef NEW_AQM
377300779Struckman	_SI( .getconfig = )  NULL,
378300779Struckman#endif
379300779Struckman
380204591Sluigi};
381204591Sluigi
382204591Sluigi
383204591SluigiDECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc);
384