1/*
2 * CoDel - The Controlled-Delay Active Queue Management algorithm
3 *
4 *  Copyright (C) 2013 Ermal Luçi <eri@FreeBSD.org>
5 *  Copyright (C) 2011-2012 Kathleen Nichols <nichols@pollere.com>
6 *  Copyright (C) 2011-2012 Van Jacobson <van@pollere.net>
7 *  Copyright (C) 2012 Michael D. Taht <dave.taht@bufferbloat.net>
8 *  Copyright (C) 2012 Eric Dumazet <edumazet@google.com>
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 *    notice, this list of conditions, and the following disclaimer,
15 *    without modification.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 *    notice, this list of conditions and the following disclaimer in the
18 *    documentation and/or other materials provided with the distribution.
19 * 3. The names of the authors may not be used to endorse or promote products
20 *    derived from this software without specific prior written permission.
21 *
22 * Alternatively, provided that this notice is retained in full, this
23 * software may be distributed under the terms of the GNU General
24 * Public License ("GPL") version 2, in which case the provisions of the
25 * GPL apply INSTEAD OF those given above.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
28 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
29 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
30 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
31 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
32 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
33 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
37 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
38 * DAMAGE.
39 *
40 * $FreeBSD$
41 */
42#include "opt_altq.h"
43#include "opt_inet.h"
44#include "opt_inet6.h"
45
46#ifdef ALTQ_CODEL  /* CoDel is enabled by ALTQ_CODEL option in opt_altq.h */
47
48#include <sys/param.h>
49#include <sys/malloc.h>
50#include <sys/mbuf.h>
51#include <sys/socket.h>
52#include <sys/systm.h>
53
54#include <net/if.h>
55#include <net/if_var.h>
56#include <netinet/in.h>
57
58#include <netpfil/pf/pf.h>
59#include <netpfil/pf/pf_altq.h>
60#include <net/altq/if_altq.h>
61#include <net/altq/altq.h>
62#include <net/altq/altq_codel.h>
63
static int		 codel_should_drop(struct codel *, class_queue_t *,
			    struct mbuf *, u_int64_t);
static void		 codel_Newton_step(struct codel_vars *);
static u_int64_t	 codel_control_law(u_int64_t t, u_int64_t, u_int32_t);

/*
 * Compare machclk timestamps via a signed difference (Linux
 * time_after()-style), so the comparisons stay meaningful across
 * counter wraparound.
 */
#define	codel_time_after(a, b)		((int64_t)(a) - (int64_t)(b) > 0)
#define	codel_time_after_eq(a, b)	((int64_t)(a) - (int64_t)(b) >= 0)
#define	codel_time_before(a, b)		((int64_t)(a) - (int64_t)(b) < 0)
#define	codel_time_before_eq(a, b)	((int64_t)(a) - (int64_t)(b) <= 0)

static int codel_request(struct ifaltq *, int, void *);

static int codel_enqueue(struct ifaltq *, struct mbuf *, struct altq_pktattr *);
static struct mbuf *codel_dequeue(struct ifaltq *, int);
78
79int
80codel_pfattach(struct pf_altq *a)
81{
82	struct ifnet *ifp;
83
84	if ((ifp = ifunit(a->ifname)) == NULL || a->altq_disc == NULL)
85		return (EINVAL);
86
87	return (altq_attach(&ifp->if_snd, ALTQT_CODEL, a->altq_disc,
88	    codel_enqueue, codel_dequeue, codel_request, NULL, NULL));
89}
90
91int
92codel_add_altq(struct pf_altq *a)
93{
94	struct codel_if	*cif;
95	struct ifnet	*ifp;
96	struct codel_opts	*opts;
97
98	if ((ifp = ifunit(a->ifname)) == NULL)
99		return (EINVAL);
100	if (!ALTQ_IS_READY(&ifp->if_snd))
101		return (ENODEV);
102
103	opts = &a->pq_u.codel_opts;
104
105	cif = malloc(sizeof(struct codel_if), M_DEVBUF, M_NOWAIT | M_ZERO);
106	if (cif == NULL)
107		return (ENOMEM);
108	cif->cif_bandwidth = a->ifbandwidth;
109	cif->cif_ifq = &ifp->if_snd;
110
111	cif->cl_q = malloc(sizeof(class_queue_t), M_DEVBUF, M_NOWAIT | M_ZERO);
112	if (cif->cl_q == NULL) {
113		free(cif, M_DEVBUF);
114		return (ENOMEM);
115	}
116
117	if (a->qlimit == 0)
118		a->qlimit = 50;	/* use default. */
119	qlimit(cif->cl_q) = a->qlimit;
120	qtype(cif->cl_q) = Q_CODEL;
121	qlen(cif->cl_q) = 0;
122	qsize(cif->cl_q) = 0;
123
124	if (opts->target == 0)
125		opts->target = 5;
126	if (opts->interval == 0)
127		opts->interval = 100;
128	cif->codel.params.target = machclk_freq * opts->target / 1000;
129	cif->codel.params.interval = machclk_freq * opts->interval / 1000;
130	cif->codel.params.ecn = opts->ecn;
131	cif->codel.stats.maxpacket = 256;
132
133	cif->cl_stats.qlength = qlen(cif->cl_q);
134	cif->cl_stats.qlimit = qlimit(cif->cl_q);
135
136	/* keep the state in pf_altq */
137	a->altq_disc = cif;
138
139	return (0);
140}
141
142int
143codel_remove_altq(struct pf_altq *a)
144{
145	struct codel_if *cif;
146
147	if ((cif = a->altq_disc) == NULL)
148		return (EINVAL);
149	a->altq_disc = NULL;
150
151	if (cif->cl_q)
152		free(cif->cl_q, M_DEVBUF);
153	free(cif, M_DEVBUF);
154
155	return (0);
156}
157
158int
159codel_getqstats(struct pf_altq *a, void *ubuf, int *nbytes)
160{
161	struct codel_if *cif;
162	struct codel_ifstats stats;
163	int error = 0;
164
165	if ((cif = altq_lookup(a->ifname, ALTQT_CODEL)) == NULL)
166		return (EBADF);
167
168	if (*nbytes < sizeof(stats))
169		return (EINVAL);
170
171	stats = cif->cl_stats;
172	stats.stats = cif->codel.stats;
173
174	if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0)
175		return (error);
176	*nbytes = sizeof(stats);
177
178	return (0);
179}
180
181static int
182codel_request(struct ifaltq *ifq, int req, void *arg)
183{
184	struct codel_if	*cif = (struct codel_if *)ifq->altq_disc;
185	struct mbuf *m;
186
187	IFQ_LOCK_ASSERT(ifq);
188
189	switch (req) {
190	case ALTRQ_PURGE:
191		if (!ALTQ_IS_ENABLED(cif->cif_ifq))
192			break;
193
194		if (qempty(cif->cl_q))
195			break;
196
197		while ((m = _getq(cif->cl_q)) != NULL) {
198			PKTCNTR_ADD(&cif->cl_stats.cl_dropcnt, m_pktlen(m));
199			m_freem(m);
200			IFQ_DEC_LEN(cif->cif_ifq);
201		}
202		cif->cif_ifq->ifq_len = 0;
203		break;
204	}
205
206	return (0);
207}
208
209static int
210codel_enqueue(struct ifaltq *ifq, struct mbuf *m, struct altq_pktattr *pktattr)
211{
212
213	struct codel_if *cif = (struct codel_if *) ifq->altq_disc;
214
215	IFQ_LOCK_ASSERT(ifq);
216
217	/* grab class set by classifier */
218	if ((m->m_flags & M_PKTHDR) == 0) {
219		/* should not happen */
220		printf("altq: packet for %s does not have pkthdr\n",
221		   ifq->altq_ifp->if_xname);
222		m_freem(m);
223		PKTCNTR_ADD(&cif->cl_stats.cl_dropcnt, m_pktlen(m));
224		return (ENOBUFS);
225	}
226
227	if (codel_addq(&cif->codel, cif->cl_q, m)) {
228		PKTCNTR_ADD(&cif->cl_stats.cl_dropcnt, m_pktlen(m));
229		return (ENOBUFS);
230	}
231	IFQ_INC_LEN(ifq);
232
233	return (0);
234}
235
236static struct mbuf *
237codel_dequeue(struct ifaltq *ifq, int op)
238{
239	struct codel_if *cif = (struct codel_if *)ifq->altq_disc;
240	struct mbuf *m;
241
242	IFQ_LOCK_ASSERT(ifq);
243
244	if (IFQ_IS_EMPTY(ifq))
245		return (NULL);
246
247	if (op == ALTDQ_POLL)
248		return (qhead(cif->cl_q));
249
250
251	m = codel_getq(&cif->codel, cif->cl_q);
252	if (m != NULL) {
253		IFQ_DEC_LEN(ifq);
254		PKTCNTR_ADD(&cif->cl_stats.cl_xmitcnt, m_pktlen(m));
255		return (m);
256	}
257
258	return (NULL);
259}
260
261struct codel *
262codel_alloc(int target, int interval, int ecn)
263{
264	struct codel *c;
265
266	c = malloc(sizeof(*c), M_DEVBUF, M_NOWAIT | M_ZERO);
267	if (c != NULL) {
268		c->params.target = machclk_freq * target / 1000;
269		c->params.interval = machclk_freq * interval / 1000;
270		c->params.ecn = ecn;
271		c->stats.maxpacket = 256;
272	}
273
274	return (c);
275}
276
/*
 * Release a codel instance obtained from codel_alloc().
 */
void
codel_destroy(struct codel *c)
{

	free(c, M_DEVBUF);
}
283
284#define	MTAG_CODEL	1438031249
285int
286codel_addq(struct codel *c, class_queue_t *q, struct mbuf *m)
287{
288	struct m_tag *mtag;
289	uint64_t *enqueue_time;
290
291	if (qlen(q) < qlimit(q)) {
292		mtag = m_tag_locate(m, MTAG_CODEL, 0, NULL);
293		if (mtag == NULL)
294			mtag = m_tag_alloc(MTAG_CODEL, 0, sizeof(uint64_t),
295			    M_NOWAIT);
296		if (mtag == NULL) {
297			m_freem(m);
298			return (-1);
299		}
300		enqueue_time = (uint64_t *)(mtag + 1);
301		*enqueue_time = read_machclk();
302		m_tag_prepend(m, mtag);
303		_addq(q, m);
304		return (0);
305	}
306	c->drop_overlimit++;
307	m_freem(m);
308
309	return (-1);
310}
311
312static int
313codel_should_drop(struct codel *c, class_queue_t *q, struct mbuf *m,
314    u_int64_t now)
315{
316	struct m_tag *mtag;
317	uint64_t *enqueue_time;
318
319	if (m == NULL) {
320		c->vars.first_above_time = 0;
321		return (0);
322	}
323
324	mtag = m_tag_locate(m, MTAG_CODEL, 0, NULL);
325	if (mtag == NULL) {
326		/* Only one warning per second. */
327		if (ppsratecheck(&c->last_log, &c->last_pps, 1))
328			printf("%s: could not found the packet mtag!\n",
329			    __func__);
330		c->vars.first_above_time = 0;
331		return (0);
332	}
333	enqueue_time = (uint64_t *)(mtag + 1);
334	c->vars.ldelay = now - *enqueue_time;
335	c->stats.maxpacket = MAX(c->stats.maxpacket, m_pktlen(m));
336
337	if (codel_time_before(c->vars.ldelay, c->params.target) ||
338	    qsize(q) <= c->stats.maxpacket) {
339		/* went below - stay below for at least interval */
340		c->vars.first_above_time = 0;
341		return (0);
342	}
343	if (c->vars.first_above_time == 0) {
344		/* just went above from below. If we stay above
345		 * for at least interval we'll say it's ok to drop
346		 */
347		c->vars.first_above_time = now + c->params.interval;
348		return (0);
349	}
350	if (codel_time_after(now, c->vars.first_above_time))
351		return (1);
352
353	return (0);
354}
355
356/*
357 * Run a Newton method step:
358 * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
359 *
360 * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
361 */
static void
codel_Newton_step(struct codel_vars *vars)
{
	uint32_t invsqrt, invsqrt2;
	uint64_t val;

/* sizeof_in_bits(rec_inv_sqrt) */
#define	REC_INV_SQRT_BITS (8 * sizeof(u_int16_t))
/* needed shift to get a Q0.32 number from rec_inv_sqrt */
#define	REC_INV_SQRT_SHIFT (32 - REC_INV_SQRT_BITS)

	/* Widen the stored 16-bit estimate to Q0.32 fixed point. */
	invsqrt = ((u_int32_t)vars->rec_inv_sqrt) << REC_INV_SQRT_SHIFT;
	/* invsqrt^2, renormalized back to Q0.32. */
	invsqrt2 = ((u_int64_t)invsqrt * invsqrt) >> 32;
	/* 3 - count * invsqrt^2, in Q0.32 (the 3LL<<32 is 3.0 in Q0.32). */
	val = (3LL << 32) - ((u_int64_t)vars->count * invsqrt2);
	val >>= 2; /* avoid overflow in following multiply */
	/*
	 * Multiply by invsqrt and shift back: the -2 undoes the
	 * pre-shift above, the +1 implements the division by 2 in the
	 * Newton formula.
	 */
	val = (val * invsqrt) >> (32 - 2 + 1);

	/* Store back as a 16-bit mantissa. */
	vars->rec_inv_sqrt = val >> REC_INV_SQRT_SHIFT;
}
381
382static u_int64_t
383codel_control_law(u_int64_t t, u_int64_t interval, u_int32_t rec_inv_sqrt)
384{
385
386	return (t + (u_int32_t)(((u_int64_t)interval *
387	    (rec_inv_sqrt << REC_INV_SQRT_SHIFT)) >> 32));
388}
389
/*
 * Dequeue the next packet to transmit from q, applying the CoDel drop
 * decision to each candidate.  While in dropping state, packets are
 * discarded with increasing frequency (drop_next rescheduled through
 * codel_control_law()) until the sojourn time falls back below target.
 * Returns the packet to transmit, or NULL when the queue is empty or
 * every candidate was dropped.
 */
struct mbuf *
codel_getq(struct codel *c, class_queue_t *q)
{
	struct mbuf	*m;
	u_int64_t	 now;
	int		 drop;

	if ((m = _getq(q)) == NULL) {
		/* Empty queue: reset dropping state. */
		c->vars.dropping = 0;
		return (m);
	}

	now = read_machclk();
	drop = codel_should_drop(c, q, m, now);
	if (c->vars.dropping) {
		if (!drop) {
			/* sojourn time below target - leave dropping state */
			c->vars.dropping = 0;
		} else if (codel_time_after_eq(now, c->vars.drop_next)) {
			/* It's time for the next drop. Drop the current
			 * packet and dequeue the next. The dequeue might
			 * take us out of dropping state.
			 * If not, schedule the next drop.
			 * A large backlog might result in drop rates so high
			 * that the next drop should happen now,
			 * hence the while loop.
			 */
			while (c->vars.dropping &&
			    codel_time_after_eq(now, c->vars.drop_next)) {
				c->vars.count++; /* don't care of possible wrap
						  * since there is no more
						  * divide */
				codel_Newton_step(&c->vars);
				/* TODO ECN */
				PKTCNTR_ADD(&c->stats.drop_cnt, m_pktlen(m));
				m_freem(m);
				m = _getq(q);
				if (!codel_should_drop(c, q, m, now))
					/* leave dropping state */
					c->vars.dropping = 0;
				else
					/* and schedule the next drop */
					c->vars.drop_next =
					    codel_control_law(c->vars.drop_next,
						c->params.interval,
						c->vars.rec_inv_sqrt);
			}
		}
	} else if (drop) {
		/* First drop of a new dropping episode. */
		/* TODO ECN */
		PKTCNTR_ADD(&c->stats.drop_cnt, m_pktlen(m));
		m_freem(m);

		m = _getq(q);
		drop = codel_should_drop(c, q, m, now);

		c->vars.dropping = 1;
		/* if min went above target close to when we last went below it
		 * assume that the drop rate that controlled the queue on the
		 * last cycle is a good starting point to control it now.
		 */
		if (codel_time_before(now - c->vars.drop_next,
		    16 * c->params.interval)) {
			/* Resume from the previous episode's drop rate. */
			c->vars.count = (c->vars.count - c->vars.lastcount) | 1;
			/* we dont care if rec_inv_sqrt approximation
			 * is not very precise :
			 * Next Newton steps will correct it quadratically.
			 */
			codel_Newton_step(&c->vars);
		} else {
			/* Cold start: count = 1, 1/sqrt(1) = 1.0 in Q0.16. */
			c->vars.count = 1;
			c->vars.rec_inv_sqrt = ~0U >> REC_INV_SQRT_SHIFT;
		}
		c->vars.lastcount = c->vars.count;
		c->vars.drop_next = codel_control_law(now, c->params.interval,
		    c->vars.rec_inv_sqrt);
	}

	return (m);
}
470
/*
 * Copy the algorithm statistics out of c into *s.
 */
void
codel_getstats(struct codel *c, struct codel_stats *s)
{
	*s = c->stats;
}
476
477#endif /* ALTQ_CODEL */
478